85 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/types.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/pkt_sched.h> #include <net/caif/caif_layer.h> #include <net/caif/cfsrvl.h> #include <net/caif/cfpkt.h> #include <net/caif/caif_dev.h> #define SRVL_CTRL_PKT_SIZE 1 #define SRVL_FLOW_OFF 0x81 #define SRVL_FLOW_ON 0x80 #define SRVL_SET_PIN 0x82 #define container_obj(layr) container_of(layr, struct cfsrvl, layer) static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { struct cfsrvl *service = container_obj(layr); if (layr->up == NULL || layr->up->ctrlcmd == NULL) return; switch (ctrl) { case CAIF_CTRLCMD_INIT_RSP: service->open = true; layr->up->ctrlcmd(layr->up, ctrl, phyid); break; case CAIF_CTRLCMD_DEINIT_RSP: case CAIF_CTRLCMD_INIT_FAIL_RSP: service->open = false; layr->up->ctrlcmd(layr->up, ctrl, phyid); break; case _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND: if (phyid != service->dev_info.id) break; if (service->modem_flow_on) layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_OFF_IND, phyid); service->phy_flow_on = false; break; case _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND: if (phyid != service->dev_info.id) return; if (service->modem_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_ON_IND, phyid); } service->phy_flow_on = true; break; case CAIF_CTRLCMD_FLOW_OFF_IND: if (service->phy_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_OFF_IND, phyid); } service->modem_flow_on = false; break; case CAIF_CTRLCMD_FLOW_ON_IND: if (service->phy_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_ON_IND, phyid); } service->modem_flow_on = true; break; case _CAIF_CTRLCMD_PHYIF_DOWN_IND: /* In case interface is down, let's fake a remove shutdown */ layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, phyid); break; case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: layr->up->ctrlcmd(layr->up, ctrl, phyid); break; default: pr_warn("Unexpected ctrl in cfsrvl (%d)\n", ctrl); /* We have both modem and phy flow on, send flow on */ layr->up->ctrlcmd(layr->up, ctrl, phyid); service->phy_flow_on = true; break; } } static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl) { struct cfsrvl *service = container_obj(layr); caif_assert(layr != NULL); caif_assert(layr->dn != NULL); caif_assert(layr->dn->transmit != NULL); if (!service->supports_flowctrl) return 0; switch (ctrl) { case CAIF_MODEMCMD_FLOW_ON_REQ: { struct cfpkt *pkt; struct caif_payload_info *info; u8 flow_on = SRVL_FLOW_ON; pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); if (!pkt) return -ENOMEM; if (cfpkt_add_head(pkt, &flow_on, 1) < 0) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } info = cfpkt_info(pkt); info->channel_id = service->layer.id; info->hdr_len = 1; info->dev_info = &service->dev_info; cfpkt_set_prio(pkt, TC_PRIO_CONTROL); return layr->dn->transmit(layr->dn, pkt); } case CAIF_MODEMCMD_FLOW_OFF_REQ: { struct cfpkt *pkt; struct caif_payload_info *info; u8 flow_off = SRVL_FLOW_OFF; pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); if (!pkt) return -ENOMEM; if (cfpkt_add_head(pkt, &flow_off, 1) < 0) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } info = cfpkt_info(pkt); info->channel_id = service->layer.id; info->hdr_len = 1; info->dev_info = &service->dev_info; cfpkt_set_prio(pkt, TC_PRIO_CONTROL); return layr->dn->transmit(layr->dn, pkt); } default: break; } return -EINVAL; } static void cfsrvl_release(struct cflayer *layer) { struct cfsrvl *service = container_of(layer, struct cfsrvl, layer); kfree(service); } void cfsrvl_init(struct cfsrvl *service, u8 channel_id, struct dev_info *dev_info, bool supports_flowctrl) { caif_assert(offsetof(struct cfsrvl, layer) == 0); service->open = false; service->modem_flow_on = true; service->phy_flow_on = true; service->layer.id = channel_id; service->layer.ctrlcmd = cfservl_ctrlcmd; service->layer.modemcmd = cfservl_modemcmd; service->dev_info = *dev_info; service->supports_flowctrl = supports_flowctrl; service->release = cfsrvl_release; } bool cfsrvl_ready(struct cfsrvl *service, int *err) { if (!service->open) { *err = -ENOTCONN; return false; } return true; } u8 cfsrvl_getphyid(struct cflayer *layer) { struct cfsrvl *servl = container_obj(layer); return servl->dev_info.id; } bool cfsrvl_phyid_match(struct cflayer *layer, int phyid) { struct cfsrvl *servl = container_obj(layer); return servl->dev_info.id == phyid; } void caif_free_client(struct cflayer *adap_layer) { struct cfsrvl *servl; if (adap_layer == NULL || adap_layer->dn == NULL) return; servl = container_obj(adap_layer->dn); servl->release(&servl->layer); } EXPORT_SYMBOL(caif_free_client); void caif_client_register_refcnt(struct cflayer *adapt_layer, void (*hold)(struct cflayer *lyr), void (*put)(struct cflayer *lyr)) { struct cfsrvl *service; if (WARN_ON(adapt_layer == NULL || adapt_layer->dn == NULL)) return; service = container_of(adapt_layer->dn, struct cfsrvl, layer); service->hold = hold; service->put = put; } EXPORT_SYMBOL(caif_client_register_refcnt); |
5 64 141 141 140 141 30 110 161 4 157 157 157 152 5 5 5 5 5 75 70 22 42 2 6 4 4 3 1 24 24 24 24 24 24 757 735 23 23 23 23 145 137 4 3 3 3 3 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 | // SPDX-License-Identifier: GPL-2.0-or-later /* * lwtunnel Infrastructure for light weight tunnels like mpls * * Authors: Roopa Prabhu, <roopa@cumulusnetworks.com> */ #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/lwtunnel.h> #include <linux/in.h> #include <linux/init.h> #include <linux/err.h> #include <net/lwtunnel.h> #include <net/rtnetlink.h> #include <net/ip6_fib.h> #include <net/rtnh.h> DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled); #ifdef CONFIG_MODULES static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) { /* Only lwt encaps implemented without using an interface for * the encap need to return a string here. */ switch (encap_type) { case LWTUNNEL_ENCAP_MPLS: return "MPLS"; case LWTUNNEL_ENCAP_ILA: return "ILA"; case LWTUNNEL_ENCAP_SEG6: return "SEG6"; case LWTUNNEL_ENCAP_BPF: return "BPF"; case LWTUNNEL_ENCAP_SEG6_LOCAL: return "SEG6LOCAL"; case LWTUNNEL_ENCAP_RPL: return "RPL"; case LWTUNNEL_ENCAP_IOAM6: return "IOAM6"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: case __LWTUNNEL_ENCAP_MAX: /* should not have got here */ WARN_ON(1); break; } return NULL; } #endif /* CONFIG_MODULES */ struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) { struct lwtunnel_state *lws; lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC); return lws; } EXPORT_SYMBOL_GPL(lwtunnel_state_alloc); static const struct lwtunnel_encap_ops __rcu * lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly; int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, unsigned int num) { if (num > LWTUNNEL_ENCAP_MAX) return -ERANGE; return !cmpxchg((const struct lwtunnel_encap_ops **) &lwtun_encaps[num], NULL, ops) ? 0 : -1; } EXPORT_SYMBOL_GPL(lwtunnel_encap_add_ops); int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, unsigned int encap_type) { int ret; if (encap_type == LWTUNNEL_ENCAP_NONE || encap_type > LWTUNNEL_ENCAP_MAX) return -ERANGE; ret = (cmpxchg((const struct lwtunnel_encap_ops **) &lwtun_encaps[encap_type], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops); int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack) { const struct lwtunnel_encap_ops *ops; bool found = false; int ret = -EINVAL; if (encap_type == LWTUNNEL_ENCAP_NONE || encap_type > LWTUNNEL_ENCAP_MAX) { NL_SET_ERR_MSG_ATTR(extack, encap, "Unknown LWT encapsulation type"); return ret; } ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); if (likely(ops && ops->build_state && try_module_get(ops->owner))) found = true; rcu_read_unlock(); if (found) { ret = ops->build_state(net, encap, family, cfg, lws, extack); if (ret) module_put(ops->owner); } else { /* don't rely on -EOPNOTSUPP to detect match as build_state * handlers could return it */ NL_SET_ERR_MSG_ATTR(extack, encap, "LWT encapsulation type not supported"); } return ret; } EXPORT_SYMBOL_GPL(lwtunnel_build_state); int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) { const struct lwtunnel_encap_ops *ops; int ret = -EINVAL; if (encap_type == LWTUNNEL_ENCAP_NONE || encap_type > LWTUNNEL_ENCAP_MAX) { NL_SET_ERR_MSG(extack, "Unknown lwt encapsulation type"); return ret; } rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); rcu_read_unlock(); #ifdef CONFIG_MODULES if (!ops) { const char *encap_type_str = lwtunnel_encap_str(encap_type); if (encap_type_str) { __rtnl_unlock(); request_module("rtnl-lwt-%s", encap_type_str); rtnl_lock(); rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); rcu_read_unlock(); } } #endif ret = ops ? 0 : -EOPNOTSUPP; if (ret < 0) NL_SET_ERR_MSG(extack, "lwt encapsulation type not supported"); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, struct netlink_ext_ack *extack) { struct rtnexthop *rtnh = (struct rtnexthop *)attr; struct nlattr *nla_entype; struct nlattr *attrs; u16 encap_type; int attrlen; while (rtnh_ok(rtnh, remaining)) { attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { attrs = rtnh_attrs(rtnh); nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla_entype) { if (nla_len(nla_entype) < sizeof(u16)) { NL_SET_ERR_MSG(extack, "Invalid RTA_ENCAP_TYPE"); return -EINVAL; } encap_type = nla_get_u16(nla_entype); if (lwtunnel_valid_encap_type(encap_type, extack) != 0) return -EOPNOTSUPP; } } rtnh = rtnh_next(rtnh, &remaining); } return 0; } EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type_attr); void lwtstate_free(struct lwtunnel_state *lws) { const struct lwtunnel_encap_ops *ops = lwtun_encaps[lws->type]; if (ops->destroy_state) { ops->destroy_state(lws); kfree_rcu(lws, rcu); } else { kfree(lws); } module_put(ops->owner); } EXPORT_SYMBOL_GPL(lwtstate_free); int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, int encap_attr, int encap_type_attr) { const struct lwtunnel_encap_ops *ops; struct nlattr *nest; int ret; if (!lwtstate) return 0; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; nest = nla_nest_start_noflag(skb, encap_attr); if (!nest) return -EMSGSIZE; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->fill_encap)) ret = ops->fill_encap(skb, lwtstate); rcu_read_unlock(); if (ret) goto nla_put_failure; nla_nest_end(skb, nest); ret = nla_put_u16(skb, encap_type_attr, lwtstate->type); if (ret) goto nla_put_failure; return 0; nla_put_failure: nla_nest_cancel(skb, nest); return (ret == -EOPNOTSUPP ? 0 : ret); } EXPORT_SYMBOL_GPL(lwtunnel_fill_encap); int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) { const struct lwtunnel_encap_ops *ops; int ret = 0; if (!lwtstate) return 0; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->get_encap_size)) ret = nla_total_size(ops->get_encap_size(lwtstate)); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_get_encap_size); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { const struct lwtunnel_encap_ops *ops; int ret = 0; if (!a && !b) return 0; if (!a || !b) return 1; if (a->type != b->type) return 1; if (a->type == LWTUNNEL_ENCAP_NONE || a->type > LWTUNNEL_ENCAP_MAX) return 0; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[a->type]); if (likely(ops && ops->cmp_encap)) ret = ops->cmp_encap(a, b); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; int ret = -EINVAL; if (!dst) goto drop; lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->output)) ret = ops->output(net, sk, skb); rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; return ret; drop: kfree_skb(skb); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_output); int lwtunnel_xmit(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; int ret = -EINVAL; if (!dst) goto drop; lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->xmit)) ret = ops->xmit(skb); rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; return ret; drop: kfree_skb(skb); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_xmit); int lwtunnel_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; int ret = -EINVAL; if (!dst) goto drop; lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->input)) ret = ops->input(skb); rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; return ret; drop: kfree_skb(skb); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_input); |
384 382 153 228 187 447 5 57 232 202 464 258 257 258 258 222 177 48 8 66 33 88 4 106 103 70 10 59 6 106 105 8 98 91 88 4 54 176 150 33 33 33 152 84 82 13 5 9 83 83 82 5 82 5 134 104 378 378 378 272 104 377 272 274 19 1 2 2 1 2 4 159 160 137 22 22 145 17 145 38 38 408 189 73 6 79 193 389 137 271 274 274 274 272 1 273 274 273 274 273 211 84 134 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic INET transport hashtables * * Authors: Lotsa people, from code originally in tcp */ #include <linux/module.h> #include <linux/random.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/vmalloc.h> #include <linux/memblock.h> #include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/inet6_hashtables.h> #endif #include <net/secure_seq.h> #include <net/ip.h> #include <net/tcp.h> #include <net/sock_reuseport.h> u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport) { static u32 inet_ehash_secret __read_mostly; net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); return __inet_ehashfn(laddr, lport, faddr, fport, inet_ehash_secret + net_hash_mix(net)); } EXPORT_SYMBOL_GPL(inet_ehashfn); /* This function handles inet_sock, but also timewait and request sockets * for IPv4/IPv6. */ static u32 sk_ehashfn(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6 && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) return inet6_ehashfn(sock_net(sk), &sk->sk_v6_rcv_saddr, sk->sk_num, &sk->sk_v6_daddr, sk->sk_dport); #endif return inet_ehashfn(sock_net(sk), sk->sk_rcv_saddr, sk->sk_num, sk->sk_daddr, sk->sk_dport); } /* * Allocate and initialize a new local port bind bucket. * The bindhash mutex for snum's hash chain must be held here. */ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev) { struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); if (tb) { write_pnet(&tb->ib_net, net); tb->l3mdev = l3mdev; tb->port = snum; tb->fastreuse = 0; tb->fastreuseport = 0; INIT_HLIST_HEAD(&tb->owners); hlist_add_head(&tb->node, &head->chain); } return tb; } /* * Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) { if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); kmem_cache_free(cachep, tb); } } void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum) { inet_sk(sk)->inet_num = snum; sk_add_bind_node(sk, &tb->owners); inet_csk(sk)->icsk_bind_hash = tb; } /* * Get rid of any references to a local port held by the given sock. */ static void __inet_put_port(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num, hashinfo->bhash_size); struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; struct inet_bind_bucket *tb; spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; __sk_del_bind_node(sk); inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); spin_unlock(&head->lock); } void inet_put_port(struct sock *sk) { local_bh_disable(); __inet_put_port(sk); local_bh_enable(); } EXPORT_SYMBOL(inet_put_port); int __inet_inherit_port(const struct sock *sk, struct sock *child) { struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; unsigned short port = inet_sk(child)->inet_num; const int bhash = inet_bhashfn(sock_net(sk), port, table->bhash_size); struct inet_bind_hashbucket *head = &table->bhash[bhash]; struct inet_bind_bucket *tb; int l3mdev; spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; if (unlikely(!tb)) { spin_unlock(&head->lock); return -ENOENT; } if (tb->port != port) { l3mdev = inet_sk_bound_l3mdev(sk); /* NOTE: using tproxy and redirecting skbs to a proxy * on a different listener port breaks the assumption * that the listener socket's icsk_bind_hash is the same * as that of the child socket. We have to look up or * create a new bind bucket for the child here. */ inet_bind_bucket_for_each(tb, &head->chain) { if (net_eq(ib_net(tb), sock_net(sk)) && tb->l3mdev == l3mdev && tb->port == port) break; } if (!tb) { tb = inet_bind_bucket_create(table->bind_bucket_cachep, sock_net(sk), head, port, l3mdev); if (!tb) { spin_unlock(&head->lock); return -ENOMEM; } } inet_csk_update_fastreuse(tb, child); } inet_bind_hash(child, tb, port); spin_unlock(&head->lock); return 0; } EXPORT_SYMBOL_GPL(__inet_inherit_port); static struct inet_listen_hashbucket * inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) { u32 hash; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) hash = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_num); else #endif hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); return inet_lhash2_bucket(h, hash); } static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, const int dif, const int sdif) { int score = -1; if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && !ipv6_only_sock(sk)) { if (sk->sk_rcv_saddr != daddr) return -1; if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; score = sk->sk_bound_dev_if ? 2 : 1; if (sk->sk_family == PF_INET) score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } return score; } INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn); struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, inet_ehashfn_t *ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn, net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; } EXPORT_SYMBOL_GPL(inet_lookup_reuseport); /* * Here are some nice properties to exploit here. The BSD API * does not allow a listening sock to specify the remote port nor the * remote address for the connection. So always assume those are both * wildcarded during the search since they can never be otherwise. */ /* called with rcu_read_lock() : No refcount taken on the socket */ static struct sock *inet_lhash2_lookup(struct net *net, struct inet_listen_hashbucket *ilb2, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { struct sock *sk, *result = NULL; struct hlist_nulls_node *node; int score, hiscore = 0; sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, inet_ehashfn); if (result) return result; result = sk; hiscore = score; } } return result; } static inline struct sock *inet_lookup_run_bpf(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, u16 hnum) { struct sock *sk, *reuse_sk; bool no_reuseport; if (hashinfo != &tcp_hashinfo) return NULL; /* only TCP is supported */ no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, saddr, sport, daddr, hnum, &sk); if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, inet_ehashfn); if (reuse_sk) sk = reuse_sk; return sk; } struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; struct sock *result = NULL; unsigned int hash2; /* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { result = inet_lookup_run_bpf(net, hashinfo, skb, doff, saddr, sport, daddr, hnum); if (result) goto done; } hash2 = ipv4_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, daddr, hnum, dif, sdif); if (result) goto done; /* Lookup lhash2 with INADDR_ANY */ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif); done: if (IS_ERR(result)) return NULL; return result; } EXPORT_SYMBOL_GPL(__inet_lookup_listener); /* All sockets share common refcount, but have different destructors */ void sock_gen_put(struct sock *sk) { if (!refcount_dec_and_test(&sk->sk_refcnt)) return; if (sk->sk_state == TCP_TIME_WAIT) inet_twsk_free(inet_twsk(sk)); else if (sk->sk_state == TCP_NEW_SYN_RECV) reqsk_free(inet_reqsk(sk)); else sk_free(sk); } EXPORT_SYMBOL_GPL(sock_gen_put); void sock_edemux(struct sk_buff *skb) { sock_gen_put(skb->sk); } EXPORT_SYMBOL(sock_edemux); struct sock *__inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif) { INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(sport, hnum); struct sock *sk; const struct hlist_nulls_node *node; /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); unsigned int slot = hash & hashinfo->ehash_mask; struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) continue; if (likely(INET_MATCH(net, sk, acookie, ports, dif, sdif))) { if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; if (unlikely(!INET_MATCH(net, sk, acookie, ports, dif, sdif))) { sock_gen_put(sk); goto begin; } goto found; } } /* * if the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (get_nulls_value(node) != slot) goto begin; out: sk = NULL; found: return sk; } EXPORT_SYMBOL_GPL(__inet_lookup_established); /* called with local bh disabled */ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, __u16 lport, struct inet_timewait_sock **twp) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); __be32 daddr = inet->inet_rcv_saddr; __be32 saddr = inet->inet_daddr; int dif = sk->sk_bound_dev_if; struct net *net = sock_net(sk); int sdif = l3mdev_master_ifindex_by_index(net, dif); INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); spinlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { if (sk2->sk_hash != hash) continue; if (likely(INET_MATCH(net, sk2, acookie, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (twsk_unique(sk, sk2, twp)) break; } goto not_unique; } } /* Must record num and sport now. Otherwise we will see * in hash table socket with a funny identity. */ inet->inet_num = lport; inet->inet_sport = htons(lport); sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { sk_nulls_del_node_init_rcu((struct sock *)tw); __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule_put(tw); } return 0; not_unique: spin_unlock(lock); return -EADDRNOTAVAIL; } static u64 inet_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, inet->inet_daddr, inet->inet_dport); } /* Searches for an exsiting socket in the ehash bucket list. * Returns true if found, false otherwise. */ static bool inet_ehash_lookup_by_sk(struct sock *sk, struct hlist_nulls_head *list) { const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); const int sdif = sk->sk_bound_dev_if; const int dif = sk->sk_bound_dev_if; const struct hlist_nulls_node *node; struct net *net = sock_net(sk); struct sock *esk; INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); sk_nulls_for_each_rcu(esk, node, list) { if (esk->sk_hash != sk->sk_hash) continue; if (sk->sk_family == AF_INET) { if (unlikely(INET_MATCH(net, esk, acookie, ports, dif, sdif))) { return true; } } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { if (unlikely(inet6_match(net, esk, &sk->sk_v6_daddr, &sk->sk_v6_rcv_saddr, ports, dif, sdif))) { return true; } } #endif } return false; } /* Insert a socket into ehash, and eventually remove another one * (The another one can be a SYN_RECV or TIMEWAIT) * If an existing socket already exists, socket sk is not inserted, * and sets found_dup_sk parameter to true. */ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; struct inet_ehash_bucket *head; spinlock_t *lock; bool ret = true; WARN_ON_ONCE(!sk_unhashed(sk)); sk->sk_hash = sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); list = &head->chain; lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock(lock); if (osk) { WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); ret = sk_nulls_del_node_init_rcu(osk); } else if (found_dup_sk) { *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); if (*found_dup_sk) ret = false; } if (ret) __sk_nulls_add_node_rcu(sk, list); spin_unlock(lock); return ret; } bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) { bool ok = inet_ehash_insert(sk, osk, found_dup_sk); if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { this_cpu_inc(*sk->sk_prot->orphan_count); inet_sk_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); } return ok; } EXPORT_SYMBOL_GPL(inet_ehash_nolisten); static int inet_reuseport_add_sock(struct sock *sk, struct inet_listen_hashbucket *ilb) { struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; const struct hlist_nulls_node *node; struct sock *sk2; kuid_t uid = sock_i_uid(sk); sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { if (sk2 != sk && sk2->sk_family == sk->sk_family && ipv6_only_sock(sk2) == ipv6_only_sock(sk) && sk2->sk_bound_dev_if == sk->sk_bound_dev_if && inet_csk(sk2)->icsk_bind_hash == tb && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) return reuseport_add_sock(sk, sk2, inet_rcv_saddr_any(sk)); } return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } int __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb2; int err = 0; if (sk->sk_state != TCP_LISTEN) { local_bh_disable(); inet_ehash_nolisten(sk, osk, NULL); local_bh_enable(); return 0; } WARN_ON(!sk_unhashed(sk)); ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); spin_lock(&ilb2->lock); if (sk->sk_reuseport) { err = inet_reuseport_add_sock(sk, ilb2); if (err) goto unlock; } sock_set_flag(sk, SOCK_RCU_FREE); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); else __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: spin_unlock(&ilb2->lock); return err; } EXPORT_SYMBOL(__inet_hash); int inet_hash(struct sock *sk) { int err = 0; if (sk->sk_state != TCP_CLOSE) err = __inet_hash(sk, NULL); return err; } EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; if (sk_unhashed(sk)) return; if (sk->sk_state == TCP_LISTEN) { struct inet_listen_hashbucket *ilb2; ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); /* Don't disable bottom halves while acquiring the lock to * avoid circular locking dependency on PREEMPT_RT. */ spin_lock(&ilb2->lock); if (sk_unhashed(sk)) { spin_unlock(&ilb2->lock); return; } if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_stop_listen_sock(sk); __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(&ilb2->lock); } else { spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); if (sk_unhashed(sk)) { spin_unlock_bh(lock); return; } __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock_bh(lock); } } EXPORT_SYMBOL_GPL(inet_unhash); /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') * because 2^16 is not a multiple of num_ephemeral and this * property might be used by clever attacker. * * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though * attacks were since demonstrated, thus we use 65536 by default instead * to really give more isolation and privacy, at the expense of 256kB * of kernel memory. */ #define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) static u32 *table_perturb; int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **)) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_timewait_sock *tw = NULL; struct inet_bind_hashbucket *head; int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); struct inet_bind_bucket *tb; u32 remaining, offset; int ret, i, low, high; int l3mdev; u32 index; if (port) { local_bh_disable(); ret = check_established(death_row, sk, port, NULL); local_bh_enable(); return ret; } l3mdev = inet_sk_bound_l3mdev(sk); inet_get_local_port_range(net, &low, &high); high++; /* [32768, 60999] -> [32768, 61000[ */ remaining = high - low; if (likely(remaining > 1)) remaining &= ~1U; get_random_slow_once(table_perturb, INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); offset %= remaining; /* In first pass we try ports of @low parity. * inet_csk_get_port() does the opposite choice. */ offset &= ~1U; other_parity_scan: port = low + offset; for (i = 0; i < remaining; i += 2, port += 2) { if (unlikely(port >= high)) port -= remaining; if (inet_is_local_reserved_port(net, port)) continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); /* Does not bother with rcv_saddr checks, because * the established check is already unique enough. */ inet_bind_bucket_for_each(tb, &head->chain) { if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && tb->port == port) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) goto next_port; WARN_ON(hlist_empty(&tb->owners)); if (!check_established(death_row, sk, port, &tw)) goto ok; goto next_port; } } tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port, l3mdev); if (!tb) { spin_unlock_bh(&head->lock); return -ENOMEM; } tb->fastreuse = -1; tb->fastreuseport = -1; goto ok; next_port: spin_unlock_bh(&head->lock); cond_resched(); } offset++; if ((offset & 1) && remaining > 1) goto other_parity_scan; return -EADDRNOTAVAIL; ok: /* Here we want to add a little bit of randomness to the next source * port that will be chosen. We use a max() with a random here so that * on low contention the randomness is maximal and on high contention * it may be inexistent. */ i = max_t(int, i, (prandom_u32() & 7) * 2); WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); inet_ehash_nolisten(sk, (struct sock *)tw, NULL); } if (tw) inet_twsk_bind_unhash(tw, hinfo); spin_unlock(&head->lock); if (tw) inet_twsk_deschedule_put(tw); local_bh_enable(); return 0; } /* * Bind a port for a connect operation and hash it. */ int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { u64 port_offset = 0; if (!inet_sk(sk)->inet_num) port_offset = inet_sk_port_offset(sk); return __inet_hash_connect(death_row, sk, port_offset, __inet_check_established); } EXPORT_SYMBOL_GPL(inet_hash_connect); static void init_hashinfo_lhash2(struct inet_hashinfo *h) { int i; for (i = 0; i <= h->lhash2_mask; i++) { spin_lock_init(&h->lhash2[i].lock); INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, i + LISTENING_NULLS_BASE); } } void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit) { h->lhash2 = alloc_large_system_hash(name, sizeof(*h->lhash2), numentries, scale, 0, NULL, &h->lhash2_mask, low_limit, high_limit); init_hashinfo_lhash2(h); /* this one is used for source ports of outgoing connections */ table_perturb = alloc_large_system_hash("Table-perturb", sizeof(*table_perturb), INET_TABLE_PERTURB_SIZE, 0, 0, NULL, NULL, INET_TABLE_PERTURB_SIZE, INET_TABLE_PERTURB_SIZE); } int inet_hashinfo2_init_mod(struct inet_hashinfo *h) { h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); if (!h->lhash2) return -ENOMEM; h->lhash2_mask = INET_LHTABLE_SIZE - 1; /* INET_LHTABLE_SIZE must be a power of 2 */ BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); init_hashinfo_lhash2(h); return 0; } EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); unsigned int i, nblocks = 1; if (locksz != 0) { /* allocate 2 cache lines or at least one spinlock per cpu */ nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U); nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); /* no more locks than number of hash buckets */ nblocks = min(nblocks, hashinfo->ehash_mask + 1); hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); if (!hashinfo->ehash_locks) return -ENOMEM; for (i = 0; i < nblocks; i++) spin_lock_init(&hashinfo->ehash_locks[i]); } hashinfo->ehash_locks_mask = nblocks - 1; return 0; } EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); |
239 241 203 203 2 2 96 26 5 26 2 9 22 538 539 478 154 37 248 155 163 115 56 254 25 171 86 100 28 97 97 560 8 574 13 571 11 59 3 4 19 566 461 109 266 65 257 56 263 4 306 250 303 108 1 191 113 1 303 78 241 227 12 31 302 250 15 15 12 144 3 596 3 465 1 4 464 466 317 465 155 291 169 136 30 422 309 282 160 437 10 431 468 2 149 287 159 18 5 135 5 134 68 67 272 2 1 17 10 143 2 3 146 146 146 141 5 2 7 101 100 7 3 2 265 12 266 10 102 185 184 20 5 17 20 20 9 20 3 5 2 12 2 21 1 20 8 32 14 2 53 1 52 2 4 27 14 44 289 1 1 7 8 282 281 282 39 19 18 3 35 29 5 1 2 3 6 2 2 3 7 2 8 8 3 5 6 2 10 10 3 13 7 11 33 7 3 2 4 18 41 1 4 4 33 36 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 | // SPDX-License-Identifier: GPL-2.0-only /* * "splice": joining two ropes together by interweaving their strands. * * This is the "extended pipe" functionality, where a pipe is used as * an arbitrary in-memory buffer. Think of a pipe as a small kernel * buffer that you can use to transfer data from one end to the other. * * The traditional unix read/write is extended with a "splice()" operation * that transfers data buffers to or from a pipe buffer. * * Named by Larry McVoy, original implementation from Linus, extended by * Jens to support splicing to files, network, direct splicing, etc and * fixing lots of bugs. * * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> * */ #include <linux/bvec.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/uio.h> #include <linux/security.h> #include <linux/gfp.h> #include <linux/socket.h> #include <linux/sched/signal.h> #include "internal.h" /* * Attempt to steal a page from a pipe buffer. This should perhaps go into * a vm helper function, it's already simplified quite a bit by the * addition of remove_mapping(). If success is returned, the caller may * attempt to reuse this page for another destination. */ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; struct address_space *mapping; lock_page(page); mapping = page_mapping(page); if (mapping) { WARN_ON(!PageUptodate(page)); /* * At least for ext2 with nobh option, we need to wait on * writeback completing on this page, since we'll remove it * from the pagecache. Otherwise truncate wont wait on the * page, allowing the disk blocks to be reused by someone else * before we actually wrote our data to them. fs corruption * ensues. */ wait_on_page_writeback(page); if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) goto out_unlock; /* * If we succeeded in removing the mapping, set LRU flag * and return good. */ if (remove_mapping(mapping, page)) { buf->flags |= PIPE_BUF_FLAG_LRU; return true; } } /* * Raced with truncate or failed to remove page from current * address space, unlock and return failure. */ out_unlock: unlock_page(page); return false; } static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { put_page(buf->page); buf->flags &= ~PIPE_BUF_FLAG_LRU; } /* * Check whether the contents of buf is OK to access. Since the content * is a page cache page, IO may be in flight. */ static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; int err; if (!PageUptodate(page)) { lock_page(page); /* * Page got truncated/unhashed. This will cause a 0-byte * splice, if this is the first page. */ if (!page->mapping) { err = -ENODATA; goto error; } /* * Uh oh, read-error from disk. */ if (!PageUptodate(page)) { err = -EIO; goto error; } /* * Page is ok afterall, we are done. */ unlock_page(page); } return 0; error: unlock_page(page); return err; } const struct pipe_buf_operations page_cache_pipe_buf_ops = { .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .try_steal = page_cache_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) return false; buf->flags |= PIPE_BUF_FLAG_LRU; return generic_pipe_buf_try_steal(pipe, buf); } static const struct pipe_buf_operations user_page_pipe_buf_ops = { .release = page_cache_pipe_buf_release, .try_steal = user_page_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; static void wakeup_pipe_readers(struct pipe_inode_info *pipe) { smp_mb(); if (waitqueue_active(&pipe->rd_wait)) wake_up_interruptible(&pipe->rd_wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } /** * splice_to_pipe - fill passed data into a pipe * @pipe: pipe to fill * @spd: data to fill * * Description: * @spd contains a map of pages and len/offset tuples, along with * the struct pipe_buf_operations associated with these pages. This * function will link that data to the pipe. * */ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { unsigned int spd_pages = spd->nr_pages; unsigned int tail = pipe->tail; unsigned int head = pipe->head; unsigned int mask = pipe->ring_size - 1; int ret = 0, page_nr = 0; if (!spd_pages) return 0; if (unlikely(!pipe->readers)) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } while (!pipe_full(head, tail, pipe->max_usage)) { struct pipe_buffer *buf = &pipe->bufs[head & mask]; buf->page = spd->pages[page_nr]; buf->offset = spd->partial[page_nr].offset; buf->len = spd->partial[page_nr].len; buf->private = spd->partial[page_nr].private; buf->ops = spd->ops; buf->flags = 0; head++; pipe->head = head; page_nr++; ret += buf->len; if (!--spd->nr_pages) break; } if (!ret) ret = -EAGAIN; out: while (page_nr < spd_pages) spd->spd_release(spd, page_nr++); return ret; } EXPORT_SYMBOL_GPL(splice_to_pipe); ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { unsigned int head = pipe->head; unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; int ret; if (unlikely(!pipe->readers)) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; } else if (pipe_full(head, tail, pipe->max_usage)) { ret = -EAGAIN; } else { pipe->bufs[head & mask] = *buf; pipe->head = head + 1; return buf->len; } pipe_buf_release(pipe, buf); return ret; } EXPORT_SYMBOL(add_to_pipe); /* * Check if we need to grow the arrays holding pages and partial page * descriptions. */ int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { unsigned int max_usage = READ_ONCE(pipe->max_usage); spd->nr_pages_max = max_usage; if (max_usage <= PIPE_DEF_BUFFERS) return 0; spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL); spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page), GFP_KERNEL); if (spd->pages && spd->partial) return 0; kfree(spd->pages); kfree(spd->partial); return -ENOMEM; } void splice_shrink_spd(struct splice_pipe_desc *spd) { if (spd->nr_pages_max <= PIPE_DEF_BUFFERS) return; kfree(spd->pages); kfree(spd->partial); } /** * generic_file_splice_read - splice data from file to a pipe * @in: file to splice from * @ppos: position in @in * @pipe: pipe to splice to * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will read pages from given file and fill them into a pipe. Can be * used as long as it has more or less sane ->read_iter(). * */ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct iov_iter to; struct kiocb kiocb; unsigned int i_head; int ret; iov_iter_pipe(&to, READ, pipe, len); i_head = to.head; init_sync_kiocb(&kiocb, in); kiocb.ki_pos = *ppos; ret = call_read_iter(in, &kiocb, &to); if (ret > 0) { *ppos = kiocb.ki_pos; file_accessed(in); } else if (ret < 0) { to.head = i_head; to.iov_offset = 0; iov_iter_advance(&to, 0); /* to free what was emitted */ /* * callers of ->splice_read() expect -EAGAIN on * "can't put anything in there", rather than -EFAULT. */ if (ret == -EFAULT) ret = -EAGAIN; } return ret; } EXPORT_SYMBOL(generic_file_splice_read); const struct pipe_buf_operations default_pipe_buf_ops = { .release = generic_pipe_buf_release, .try_steal = generic_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; /* Pipe buffer operations for a socket and similar. */ const struct pipe_buf_operations nosteal_pipe_buf_ops = { .release = generic_pipe_buf_release, .get = generic_pipe_buf_get, }; EXPORT_SYMBOL(nosteal_pipe_buf_ops); /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' * using sendpage(). Return the number of bytes sent. */ static int pipe_to_sendpage(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct file *file = sd->u.file; loff_t pos = sd->pos; int more; if (!likely(file->f_op->sendpage)) return -EINVAL; more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; if (sd->len < sd->total_len && pipe_occupancy(pipe->head, pipe->tail) > 1) more |= MSG_SENDPAGE_NOTLAST; return file->f_op->sendpage(file, buf->page, buf->offset, sd->len, &pos, more); } static void wakeup_pipe_writers(struct pipe_inode_info *pipe) { smp_mb(); if (waitqueue_active(&pipe->wr_wait)) wake_up_interruptible(&pipe->wr_wait); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } /** * splice_from_pipe_feed - feed available data from a pipe to a file * @pipe: pipe to splice from * @sd: information to @actor * @actor: handler that splices the data * * Description: * This function loops over the pipe and calls @actor to do the * actual moving of a single struct pipe_buffer to the desired * destination. It returns when there's no more buffers left in * the pipe or if the requested number of bytes (@sd->total_len) * have been copied. It returns a positive number (one) if the * pipe needs to be filled with more data, zero if the required * number of bytes have been copied and -errno on error. * * This, together with splice_from_pipe_{begin,end,next}, may be * used to implement the functionality of __splice_from_pipe() when * locking is required around copying the pipe buffers to the * destination. */ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) { unsigned int head = pipe->head; unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; int ret; while (!pipe_empty(head, tail)) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; sd->len = buf->len; if (sd->len > sd->total_len) sd->len = sd->total_len; ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; return ret; } ret = actor(pipe, buf, sd); if (ret <= 0) return ret; buf->offset += ret; buf->len -= ret; sd->num_spliced += ret; sd->len -= ret; sd->pos += ret; sd->total_len -= ret; if (!buf->len) { pipe_buf_release(pipe, buf); tail++; pipe->tail = tail; if (pipe->files) sd->need_wakeup = true; } if (!sd->total_len) return 0; } return 1; } /* We know we have a pipe buffer, but maybe it's empty? */ static inline bool eat_empty_buffer(struct pipe_inode_info *pipe) { unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf = &pipe->bufs[tail & mask]; if (unlikely(!buf->len)) { pipe_buf_release(pipe, buf); pipe->tail = tail+1; return true; } return false; } /** * splice_from_pipe_next - wait for some data to splice from * @pipe: pipe to splice from * @sd: information about the splice operation * * Description: * This function will wait for some data and return a positive * value (one) if pipe buffers are available. It will return zero * or -errno if no more data needs to be spliced. */ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) { /* * Check for signal early to make process killable when there are * always buffers available */ if (signal_pending(current)) return -ERESTARTSYS; repeat: while (pipe_empty(pipe->head, pipe->tail)) { if (!pipe->writers) return 0; if (sd->num_spliced) return 0; if (sd->flags & SPLICE_F_NONBLOCK) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; if (sd->need_wakeup) { wakeup_pipe_writers(pipe); sd->need_wakeup = false; } pipe_wait_readable(pipe); } if (eat_empty_buffer(pipe)) goto repeat; return 1; } /** * splice_from_pipe_begin - start splicing from pipe * @sd: information about the splice operation * * Description: * This function should be called before a loop containing * splice_from_pipe_next() and splice_from_pipe_feed() to * initialize the necessary fields of @sd. */ static void splice_from_pipe_begin(struct splice_desc *sd) { sd->num_spliced = 0; sd->need_wakeup = false; } /** * splice_from_pipe_end - finish splicing from pipe * @pipe: pipe to splice from * @sd: information about the splice operation * * Description: * This function will wake up pipe writers if necessary. It should * be called after a loop containing splice_from_pipe_next() and * splice_from_pipe_feed(). */ static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) { if (sd->need_wakeup) wakeup_pipe_writers(pipe); } /** * __splice_from_pipe - splice data from a pipe to given actor * @pipe: pipe to splice from * @sd: information to @actor * @actor: handler that splices the data * * Description: * This function does little more than loop over the pipe and call * @actor to do the actual moving of a single struct pipe_buffer to * the desired destination. See pipe_to_file, pipe_to_sendpage, or * pipe_to_user. * */ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) { int ret; splice_from_pipe_begin(sd); do { cond_resched(); ret = splice_from_pipe_next(pipe, sd); if (ret > 0) ret = splice_from_pipe_feed(pipe, sd, actor); } while (ret > 0); splice_from_pipe_end(pipe, sd); return sd->num_spliced ? sd->num_spliced : ret; } EXPORT_SYMBOL(__splice_from_pipe); /** * splice_from_pipe - splice data from a pipe to a file * @pipe: pipe to splice from * @out: file to splice to * @ppos: position in @out * @len: how many bytes to splice * @flags: splice modifier flags * @actor: handler that splices the data * * Description: * See __splice_from_pipe. This function locks the pipe inode, * otherwise it's identical to __splice_from_pipe(). * */ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags, splice_actor *actor) { ssize_t ret; struct splice_desc sd = { .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, }; pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, actor); pipe_unlock(pipe); return ret; } /** * iter_file_splice_write - splice data from a pipe to a file * @pipe: pipe info * @out: file to write to * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will either move or copy pages (determined by @flags options) from * the given pipe inode to the given file. * This one is ->write_iter-based. * */ ssize_t iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct splice_desc sd = { .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, }; int nbufs = pipe->max_usage; struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); ssize_t ret; if (unlikely(!array)) return -ENOMEM; pipe_lock(pipe); splice_from_pipe_begin(&sd); while (sd.total_len) { struct iov_iter from; unsigned int head, tail, mask; size_t left; int n; ret = splice_from_pipe_next(pipe, &sd); if (ret <= 0) break; if (unlikely(nbufs < pipe->max_usage)) { kfree(array); nbufs = pipe->max_usage; array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); if (!array) { ret = -ENOMEM; break; } } head = pipe->head; tail = pipe->tail; mask = pipe->ring_size - 1; /* build the vector */ left = sd.total_len; for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t this_len = buf->len; /* zero-length bvecs are not supported, skip them */ if (!this_len) continue; this_len = min(this_len, left); ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; goto done; } array[n].bv_page = buf->page; array[n].bv_len = this_len; array[n].bv_offset = buf->offset; left -= this_len; n++; } iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left); ret = vfs_iter_write(out, &from, &sd.pos, 0); if (ret <= 0) break; sd.num_spliced += ret; sd.total_len -= ret; *ppos = sd.pos; /* dismiss the fully eaten buffers, adjust the partial one */ tail = pipe->tail; while (ret) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; if (ret >= buf->len) { ret -= buf->len; buf->len = 0; pipe_buf_release(pipe, buf); tail++; pipe->tail = tail; if (pipe->files) sd.need_wakeup = true; } else { buf->offset += ret; buf->len -= ret; ret = 0; } } } done: kfree(array); splice_from_pipe_end(pipe, &sd); pipe_unlock(pipe); if (sd.num_spliced) ret = sd.num_spliced; return ret; } EXPORT_SYMBOL(iter_file_splice_write); /** * generic_splice_sendpage - splice data from a pipe to a socket * @pipe: pipe to splice from * @out: socket to write to * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will send @len bytes from the pipe to a network socket. No data copying * is involved. * */ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); } EXPORT_SYMBOL(generic_splice_sendpage); static int warn_unsupported(struct file *file, const char *op) { pr_debug_ratelimited( "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } /* * Attempt to initiate a splice from pipe to file. */ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { if (unlikely(!out->f_op->splice_write)) return warn_unsupported(out, "write"); return out->f_op->splice_write(pipe, out, ppos, len, flags); } /* * Attempt to initiate a splice from a file to a pipe. */ static long do_splice_to(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { unsigned int p_space; int ret; if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; /* Don't try to read more the pipe has space for. */ p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); len = min_t(size_t, len, p_space << PAGE_SHIFT); ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; if (unlikely(len > MAX_RW_COUNT)) len = MAX_RW_COUNT; if (unlikely(!in->f_op->splice_read)) return warn_unsupported(in, "read"); return in->f_op->splice_read(in, ppos, pipe, len, flags); } /** * splice_direct_to_actor - splices data directly between two non-pipes * @in: file to splice from * @sd: actor information on where to splice to * @actor: handles the data splicing * * Description: * This is a special case helper to splice directly between two * points, without requiring an explicit pipe. Internally an allocated * pipe is cached in the process, and reused during the lifetime of * that process. * */ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, splice_direct_actor *actor) { struct pipe_inode_info *pipe; long ret, bytes; umode_t i_mode; size_t len; int i, flags, more; /* * We require the input being a regular file, as we don't want to * randomly drop data for eg socket -> socket splicing. Use the * piped splicing for that! */ i_mode = file_inode(in)->i_mode; if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) return -EINVAL; /* * neither in nor out is a pipe, setup an internal pipe attached to * 'out' and transfer the wanted data from 'in' to 'out' through that */ pipe = current->splice_pipe; if (unlikely(!pipe)) { pipe = alloc_pipe_info(); if (!pipe) return -ENOMEM; /* * We don't have an immediate reader, but we'll read the stuff * out of the pipe right after the splice_to_pipe(). So set * PIPE_READERS appropriately. */ pipe->readers = 1; current->splice_pipe = pipe; } /* * Do the splice. */ ret = 0; bytes = 0; len = sd->total_len; flags = sd->flags; /* * Don't block on output, we have to drain the direct pipe. */ sd->flags &= ~SPLICE_F_NONBLOCK; more = sd->flags & SPLICE_F_MORE; WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail)); while (len) { size_t read_len; loff_t pos = sd->pos, prev_pos = pos; ret = do_splice_to(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) goto out_release; read_len = ret; sd->total_len = read_len; /* * If more data is pending, set SPLICE_F_MORE * If this is the last data and SPLICE_F_MORE was not set * initially, clears it. */ if (read_len < len) sd->flags |= SPLICE_F_MORE; else if (!more) sd->flags &= ~SPLICE_F_MORE; /* * NOTE: nonblocking mode only applies to the input. We * must not do the output in nonblocking mode as then we * could get stuck data in the internal pipe: */ ret = actor(pipe, sd); if (unlikely(ret <= 0)) { sd->pos = prev_pos; goto out_release; } bytes += ret; len -= ret; sd->pos = pos; if (ret < read_len) { sd->pos = prev_pos + ret; goto out_release; } } done: pipe->tail = pipe->head = 0; file_accessed(in); return bytes; out_release: /* * If we did an incomplete transfer we must release * the pipe buffers in question: */ for (i = 0; i < pipe->ring_size; i++) { struct pipe_buffer *buf = &pipe->bufs[i]; if (buf->ops) pipe_buf_release(pipe, buf); } if (!bytes) bytes = ret; goto done; } EXPORT_SYMBOL(splice_direct_to_actor); static int direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) { struct file *file = sd->u.file; return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); } /** * do_splice_direct - splices data directly between two files * @in: file to splice from * @ppos: input file offset * @out: file to splice to * @opos: output file offset * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * For use by do_sendfile(). splice can easily emulate sendfile, but * doing it in the application would incur an extra system call * (splice in + splice out, as compared to just sendfile()). So this helper * can splice directly through a process-private pipe. * */ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len, unsigned int flags) { struct splice_desc sd = { .len = len, .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, .opos = opos, }; long ret; if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; ret = rw_verify_area(WRITE, out, opos, len); if (unlikely(ret < 0)) return ret; ret = splice_direct_to_actor(in, &sd, direct_splice_actor); if (ret > 0) *ppos = sd.pos; return ret; } EXPORT_SYMBOL(do_splice_direct); static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) { for (;;) { if (unlikely(!pipe->readers)) { send_sig(SIGPIPE, current, 0); return -EPIPE; } if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) return 0; if (flags & SPLICE_F_NONBLOCK) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; pipe_wait_writable(pipe); } } static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); long splice_file_to_pipe(struct file *in, struct pipe_inode_info *opipe, loff_t *offset, size_t len, unsigned int flags) { long ret; pipe_lock(opipe); ret = wait_for_space(opipe, flags); if (!ret) ret = do_splice_to(in, offset, opipe, len, flags); pipe_unlock(opipe); if (ret > 0) wakeup_pipe_readers(opipe); return ret; } /* * Determine where to splice to/from. */ long do_splice(struct file *in, loff_t *off_in, struct file *out, loff_t *off_out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset; long ret; if (unlikely(!(in->f_mode & FMODE_READ) || !(out->f_mode & FMODE_WRITE))) return -EBADF; ipipe = get_pipe_info(in, true); opipe = get_pipe_info(out, true); if (ipipe && opipe) { if (off_in || off_out) return -ESPIPE; /* Splicing to self would be fun, but... */ if (ipipe == opipe) return -EINVAL; if ((in->f_flags | out->f_flags) & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; return splice_pipe_to_pipe(ipipe, opipe, len, flags); } if (ipipe) { if (off_in) return -ESPIPE; if (off_out) { if (!(out->f_mode & FMODE_PWRITE)) return -EINVAL; offset = *off_out; } else { offset = out->f_pos; } if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; ret = rw_verify_area(WRITE, out, &offset, len); if (unlikely(ret < 0)) return ret; if (in->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; file_start_write(out); ret = do_splice_from(ipipe, out, &offset, len, flags); file_end_write(out); if (!off_out) out->f_pos = offset; else *off_out = offset; return ret; } if (opipe) { if (off_out) return -ESPIPE; if (off_in) { if (!(in->f_mode & FMODE_PREAD)) return -EINVAL; offset = *off_in; } else { offset = in->f_pos; } if (out->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; ret = splice_file_to_pipe(in, opipe, &offset, len, flags); if (!off_in) in->f_pos = offset; else *off_in = offset; return ret; } return -EINVAL; } static long __do_splice(struct file *in, loff_t __user *off_in, struct file *out, loff_t __user *off_out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset, *__off_in = NULL, *__off_out = NULL; long ret; ipipe = get_pipe_info(in, true); opipe = get_pipe_info(out, true); if (ipipe && off_in) return -ESPIPE; if (opipe && off_out) return -ESPIPE; if (off_out) { if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; __off_out = &offset; } if (off_in) { if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; __off_in = &offset; } ret = do_splice(in, __off_in, out, __off_out, len, flags); if (ret < 0) return ret; if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t))) return -EFAULT; if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t))) return -EFAULT; return ret; } static int iter_to_pipe(struct iov_iter *from, struct pipe_inode_info *pipe, unsigned flags) { struct pipe_buffer buf = { .ops = &user_page_pipe_buf_ops, .flags = flags }; size_t total = 0; int ret = 0; bool failed = false; while (iov_iter_count(from) && !failed) { struct page *pages[16]; ssize_t copied; size_t start; int n; copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start); if (copied <= 0) { ret = copied; break; } for (n = 0; copied; n++, start = 0) { int size = min_t(int, copied, PAGE_SIZE - start); if (!failed) { buf.page = pages[n]; buf.offset = start; buf.len = size; ret = add_to_pipe(pipe, &buf); if (unlikely(ret < 0)) { failed = true; } else { iov_iter_advance(from, ret); total += ret; } } else { put_page(pages[n]); } copied -= size; } } return total ? total : ret; } static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data); return n == sd->len ? n : -EFAULT; } /* * For lack of a better implementation, implement vmsplice() to userspace * as a simple copy of the pipes pages to the user iov. */ static long vmsplice_to_user(struct file *file, struct iov_iter *iter, unsigned int flags) { struct pipe_inode_info *pipe = get_pipe_info(file, true); struct splice_desc sd = { .total_len = iov_iter_count(iter), .flags = flags, .u.data = iter }; long ret = 0; if (!pipe) return -EBADF; if (sd.total_len) { pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, pipe_to_user); pipe_unlock(pipe); } return ret; } /* * vmsplice splices a user address range into a pipe. It can be thought of * as splice-from-memory, where the regular splice is splice-from-file (or * to file). In both cases the output is a pipe, naturally. */ static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, unsigned int flags) { struct pipe_inode_info *pipe; long ret = 0; unsigned buf_flag = 0; if (flags & SPLICE_F_GIFT) buf_flag = PIPE_BUF_FLAG_GIFT; pipe = get_pipe_info(file, true); if (!pipe) return -EBADF; pipe_lock(pipe); ret = wait_for_space(pipe, flags); if (!ret) ret = iter_to_pipe(iter, pipe, buf_flag); pipe_unlock(pipe); if (ret > 0) wakeup_pipe_readers(pipe); return ret; } static int vmsplice_type(struct fd f, int *type) { if (!f.file) return -EBADF; if (f.file->f_mode & FMODE_WRITE) { *type = WRITE; } else if (f.file->f_mode & FMODE_READ) { *type = READ; } else { fdput(f); return -EBADF; } return 0; } /* * Note that vmsplice only really supports true splicing _from_ user memory * to a pipe, not the other way around. Splicing from user memory is a simple * operation that can be supported without any funky alignment restrictions * or nasty vm tricks. We simply map in the user memory and fill them into * a pipe. The reverse isn't quite as easy, though. There are two possible * solutions for that: * * - memcpy() the data internally, at which point we might as well just * do a regular read() on the buffer anyway. * - Lots of nasty vm tricks, that are neither fast nor flexible (it * has restriction limitations on both ends of the pipe). * * Currently we punt and implement it as a normal copy, see pipe_to_user(). * */ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, unsigned long, nr_segs, unsigned int, flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; ssize_t error; struct fd f; int type; if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; f = fdget(fd); error = vmsplice_type(f, &type); if (error) return error; error = import_iovec(type, uiov, nr_segs, ARRAY_SIZE(iovstack), &iov, &iter); if (error < 0) goto out_fdput; if (!iov_iter_count(&iter)) error = 0; else if (iov_iter_rw(&iter) == WRITE) error = vmsplice_to_pipe(f.file, &iter, flags); else error = vmsplice_to_user(f.file, &iter, flags); kfree(iov); out_fdput: fdput(f); return error; } SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { struct fd in, out; long error; if (unlikely(!len)) return 0; if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; error = -EBADF; in = fdget(fd_in); if (in.file) { out = fdget(fd_out); if (out.file) { error = __do_splice(in.file, off_in, out.file, off_out, len, flags); fdput(out); } fdput(in); } return error; } /* * Make sure there's data to read. Wait for input if we can, otherwise * return an appropriate error. */ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; /* * Check the pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ if (!pipe_empty(pipe->head, pipe->tail)) return 0; ret = 0; pipe_lock(pipe); while (pipe_empty(pipe->head, pipe->tail)) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; } if (!pipe->writers) break; if (flags & SPLICE_F_NONBLOCK) { ret = -EAGAIN; break; } pipe_wait_readable(pipe); } pipe_unlock(pipe); return ret; } /* * Make sure there's writeable room. Wait for room if we can, otherwise * return an appropriate error. */ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; /* * Check pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) return 0; ret = 0; pipe_lock(pipe); while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; break; } if (flags & SPLICE_F_NONBLOCK) { ret = -EAGAIN; break; } if (signal_pending(current)) { ret = -ERESTARTSYS; break; } pipe_wait_writable(pipe); } pipe_unlock(pipe); return ret; } /* * Splice contents of ipipe to opipe. */ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; unsigned int i_mask, o_mask; int ret = 0; bool input_wakeup = false; retry: ret = ipipe_prep(ipipe, flags); if (ret) return ret; ret = opipe_prep(opipe, flags); if (ret) return ret; /* * Potential ABBA deadlock, work around it by ordering lock * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; i_mask = ipipe->ring_size - 1; o_head = opipe->head; o_mask = opipe->ring_size - 1; do { size_t o_len; if (!opipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } i_head = ipipe->head; o_tail = opipe->tail; if (pipe_empty(i_head, i_tail) && !ipipe->writers) break; /* * Cannot make any progress, because either the input * pipe is empty or the output pipe is full. */ if (pipe_empty(i_head, i_tail) || pipe_full(o_head, o_tail, opipe->max_usage)) { /* Already processed some buffers, break */ if (ret) break; if (flags & SPLICE_F_NONBLOCK) { ret = -EAGAIN; break; } /* * We raced with another reader/writer and haven't * managed to process any buffers. A zero return * value means EOF, so retry instead. */ pipe_unlock(ipipe); pipe_unlock(opipe); goto retry; } ibuf = &ipipe->bufs[i_tail & i_mask]; obuf = &opipe->bufs[o_head & o_mask]; if (len >= ibuf->len) { /* * Simply move the whole buffer from ipipe to opipe */ *obuf = *ibuf; ibuf->ops = NULL; i_tail++; ipipe->tail = i_tail; input_wakeup = true; o_len = obuf->len; o_head++; opipe->head = o_head; } else { /* * Get a reference to this pipe buffer, * so we can copy the contents over. */ if (!pipe_buf_get(ipipe, ibuf)) { if (ret == 0) ret = -EFAULT; break; } *obuf = *ibuf; /* * Don't inherit the gift and merge flags, we need to * prevent multiple steals of this page. */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE; obuf->len = len; ibuf->offset += len; ibuf->len -= len; o_len = len; o_head++; opipe->head = o_head; } ret += o_len; len -= o_len; } while (len); pipe_unlock(ipipe); pipe_unlock(opipe); /* * If we put data in the output pipe, wakeup any potential readers. */ if (ret > 0) wakeup_pipe_readers(opipe); if (input_wakeup) wakeup_pipe_writers(ipipe); return ret; } /* * Link contents of ipipe to opipe. */ static int link_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; unsigned int i_mask, o_mask; int ret = 0; /* * Potential ABBA deadlock, work around it by ordering lock * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; i_mask = ipipe->ring_size - 1; o_head = opipe->head; o_mask = opipe->ring_size - 1; do { if (!opipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } i_head = ipipe->head; o_tail = opipe->tail; /* * If we have iterated all input buffers or run out of * output room, break. */ if (pipe_empty(i_head, i_tail) || pipe_full(o_head, o_tail, opipe->max_usage)) break; ibuf = &ipipe->bufs[i_tail & i_mask]; obuf = &opipe->bufs[o_head & o_mask]; /* * Get a reference to this pipe buffer, * so we can copy the contents over. */ if (!pipe_buf_get(ipipe, ibuf)) { if (ret == 0) ret = -EFAULT; break; } *obuf = *ibuf; /* * Don't inherit the gift and merge flag, we need to prevent * multiple steals of this page. */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE; if (obuf->len > len) obuf->len = len; ret += obuf->len; len -= obuf->len; o_head++; opipe->head = o_head; i_tail++; } while (len); pipe_unlock(ipipe); pipe_unlock(opipe); /* * If we put data in the output pipe, wakeup any potential readers. */ if (ret > 0) wakeup_pipe_readers(opipe); return ret; } /* * This is a tee(1) implementation that works on pipes. It doesn't copy * any data, it simply references the 'in' pages on the 'out' pipe. * The 'flags' used are the SPLICE_F_* variants, currently the only * applicable one is SPLICE_F_NONBLOCK. */ long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe = get_pipe_info(in, true); struct pipe_inode_info *opipe = get_pipe_info(out, true); int ret = -EINVAL; if (unlikely(!(in->f_mode & FMODE_READ) || !(out->f_mode & FMODE_WRITE))) return -EBADF; /* * Duplicate the contents of ipipe to opipe without actually * copying the data. */ if (ipipe && opipe && ipipe != opipe) { if ((in->f_flags | out->f_flags) & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; /* * Keep going, unless we encounter an error. The ipipe/opipe * ordering doesn't really matter. */ ret = ipipe_prep(ipipe, flags); if (!ret) { ret = opipe_prep(opipe, flags); if (!ret) ret = link_pipe(ipipe, opipe, len, flags); } } return ret; } SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) { struct fd in, out; int error; if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; if (unlikely(!len)) return 0; error = -EBADF; in = fdget(fdin); if (in.file) { out = fdget(fdout); if (out.file) { error = do_tee(in.file, out.file, len, flags); fdput(out); } fdput(in); } return error; } |
142 2 25 24 2 108 1 117 26 32 32 15 20 283 1 12 259 177 177 81 177 2 188 84 197 31 65 30 68 66 105 68 182 182 1 1 1 4 24 3 2 23 78 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. */ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/highmem.h> #include <linux/fs.h> #include <linux/rwsem.h> #include <linux/semaphore.h> #include <linux/completion.h> #include <linux/backing-dev.h> #include <linux/wait.h> #include <linux/slab.h> #include <trace/events/btrfs.h> #include <asm/unaligned.h> #include <linux/pagemap.h> #include <linux/btrfs.h> #include <linux/btrfs_tree.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/sizes.h> #include <linux/dynamic_debug.h> #include <linux/refcount.h> #include <linux/crc32c.h> #include <linux/iomap.h> #include "extent-io-tree.h" #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" #include "block-rsv.h" #include "locking.h" struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; struct btrfs_space_info; struct btrfs_block_group; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; struct btrfs_ref; #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ /* * Maximum number of mirrors that can be available for all profiles counting * the target device of dev-replace as one. During an active device replace * procedure, the target device of the copy operation is a mirror for the * filesystem data as well that can be used to read data in order to repair * read errors on other disks. * * Current value is derived from RAID1C4 with 4 copies. */ #define BTRFS_MAX_MIRRORS (4 + 1) #define BTRFS_MAX_LEVEL 8 #define BTRFS_OLDEST_GENERATION 0ULL /* * we can actually store much bigger names, but lets not confuse the rest * of linux */ #define BTRFS_NAME_LEN 255 /* * Theoretical limit is larger, but we keep this down to a sane * value. That should limit greatly the possibility of collisions on * inode ref items. */ #define BTRFS_LINK_MAX 65535U #define BTRFS_EMPTY_DIR_SIZE 0 /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) #define BTRFS_DIRTY_METADATA_THRESH SZ_32M /* * Use large batch size to reduce overhead of metadata updates. On the reader * side, we only read it when we are close to ENOSPC and the read overhead is * mostly related to the number of CPUs, so it is OK to use arbitrary large * value here. */ #define BTRFS_TOTAL_BYTES_PINNED_BATCH SZ_128M #define BTRFS_MAX_EXTENT_SIZE SZ_128M /* * Deltas are an effective way to populate global statistics. Give macro names * to make it clear what we're doing. An example is discard_extents in * btrfs_free_space_ctl. */ #define BTRFS_STAT_NR_ENTRIES 2 #define BTRFS_STAT_CURR 0 #define BTRFS_STAT_PREV 1 static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); return sizeof(struct btrfs_chunk) + sizeof(struct btrfs_stripe) * (num_stripes - 1); } /* * Runtime (in-memory) states of filesystem */ enum { /* Global indicator of serious filesystem errors */ BTRFS_FS_STATE_ERROR, /* * Filesystem is being remounted, allow to skip some operations, like * defrag */ BTRFS_FS_STATE_REMOUNTING, /* Filesystem in RO mode */ BTRFS_FS_STATE_RO, /* Track if a transaction abort has been reported on this filesystem */ BTRFS_FS_STATE_TRANS_ABORTED, /* * Bio operations should be blocked on this filesystem because a source * or target device is being destroyed as part of a device replace */ BTRFS_FS_STATE_DEV_REPLACING, /* The btrfs_fs_info created for self-tests */ BTRFS_FS_STATE_DUMMY_FS_INFO, /* Indicates there was an error cleaning up a log tree. */ BTRFS_FS_STATE_LOG_CLEANUP_ERROR, }; #define BTRFS_BACKREF_REV_MAX 256 #define BTRFS_BACKREF_REV_SHIFT 56 #define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ BTRFS_BACKREF_REV_SHIFT) #define BTRFS_OLD_BACKREF_REV 0 #define BTRFS_MIXED_BACKREF_REV 1 /* * every tree block (leaf or node) starts with this header. */ struct btrfs_header { /* these first four must match the super block */ u8 csum[BTRFS_CSUM_SIZE]; u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ __le64 bytenr; /* which block this node is supposed to live in */ __le64 flags; /* allowed to be different from the super from here on down */ u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; __le64 generation; __le64 owner; __le32 nritems; u8 level; } __attribute__ ((__packed__)); /* * this is a very generous portion of the super block, giving us * room to translate 14 chunks with 3 stripes each. */ #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 /* * just in case we somehow lose the roots and are not able to mount, * we store an array of the roots from previous transactions * in the super. */ #define BTRFS_NUM_BACKUP_ROOTS 4 struct btrfs_root_backup { __le64 tree_root; __le64 tree_root_gen; __le64 chunk_root; __le64 chunk_root_gen; __le64 extent_root; __le64 extent_root_gen; __le64 fs_root; __le64 fs_root_gen; __le64 dev_root; __le64 dev_root_gen; __le64 csum_root; __le64 csum_root_gen; __le64 total_bytes; __le64 bytes_used; __le64 num_devices; /* future */ __le64 unused_64[4]; u8 tree_root_level; u8 chunk_root_level; u8 extent_root_level; u8 fs_root_level; u8 dev_root_level; u8 csum_root_level; /* future and to align */ u8 unused_8[10]; } __attribute__ ((__packed__)); /* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc */ struct btrfs_super_block { /* the first 4 fields must match struct btrfs_header */ u8 csum[BTRFS_CSUM_SIZE]; /* FS specific UUID, visible to user */ u8 fsid[BTRFS_FSID_SIZE]; __le64 bytenr; /* this block number */ __le64 flags; /* allowed to be different from the btrfs_header from here own down */ __le64 magic; __le64 generation; __le64 root; __le64 chunk_root; __le64 log_root; /* this will help find the new super based on the log root */ __le64 log_root_transid; __le64 total_bytes; __le64 bytes_used; __le64 root_dir_objectid; __le64 num_devices; __le32 sectorsize; __le32 nodesize; __le32 __unused_leafsize; __le32 stripesize; __le32 sys_chunk_array_size; __le64 chunk_root_generation; __le64 compat_flags; __le64 compat_ro_flags; __le64 incompat_flags; __le16 csum_type; u8 root_level; u8 chunk_root_level; u8 log_root_level; struct btrfs_dev_item dev_item; char label[BTRFS_LABEL_SIZE]; __le64 cache_generation; __le64 uuid_tree_generation; /* the UUID written into btree blocks */ u8 metadata_uuid[BTRFS_FSID_SIZE]; /* future expansion */ __le64 reserved[28]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; } __attribute__ ((__packed__)); /* * Compat flags that we support. If any incompat flags are set other than the * ones specified below then we will fail to mount */ #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP \ (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ BTRFS_FEATURE_COMPAT_RO_VERITY) #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL #ifdef CONFIG_BTRFS_DEBUG /* * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG */ #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ BTRFS_FEATURE_INCOMPAT_ZONED | \ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) #else #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ BTRFS_FEATURE_INCOMPAT_ZONED) #endif #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) #define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL /* * A leaf is full of items. offset and size tell us where to find * the item in the leaf (relative to the start of the data area) */ struct btrfs_item { struct btrfs_disk_key key; __le32 offset; __le32 size; } __attribute__ ((__packed__)); /* * leaves have an item area and a data area: * [item0, item1....itemN] [free space] [dataN...data1, data0] * * The data is separate from the items to get the keys closer together * during searches. */ struct btrfs_leaf { struct btrfs_header header; struct btrfs_item items[]; } __attribute__ ((__packed__)); /* * all non-leaf blocks are nodes, they hold only keys and pointers to * other blocks */ struct btrfs_key_ptr { struct btrfs_disk_key key; __le64 blockptr; __le64 generation; } __attribute__ ((__packed__)); struct btrfs_node { struct btrfs_header header; struct btrfs_key_ptr ptrs[]; } __attribute__ ((__packed__)); /* Read ahead values for struct btrfs_path.reada */ enum { READA_NONE, READA_BACK, READA_FORWARD, /* * Similar to READA_FORWARD but unlike it: * * 1) It will trigger readahead even for leaves that are not close to * each other on disk; * 2) It also triggers readahead for nodes; * 3) During a search, even when a node or leaf is already in memory, it * will still trigger readahead for other nodes and leaves that follow * it. * * This is meant to be used only when we know we are iterating over the * entire tree or a very large part of it. */ READA_FORWARD_ALWAYS, }; /* * btrfs_paths remember the path taken from the root down to the leaf. * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point * to any other levels that are present. * * The slots array records the index of the item or block pointer * used while walking the tree. */ struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; /* if there is real range locking, this locks field will change */ u8 locks[BTRFS_MAX_LEVEL]; u8 reada; /* keep some upper locks as we walk down */ u8 lowest_level; /* * set by btrfs_split_item, tells search_slot to keep all locks * and to force calls to keep space in the nodes */ unsigned int search_for_split:1; unsigned int keep_locks:1; unsigned int skip_locking:1; unsigned int search_commit_root:1; unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; /* * Indicate that new item (btrfs_search_slot) is extending already * existing item and ins_len contains only the data size and not item * header (ie. sizeof(struct btrfs_item) is not included). */ unsigned int search_for_extension:1; }; #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) struct btrfs_dev_replace { u64 replace_state; /* see #define above */ time64_t time_started; /* seconds since 1-Jan-1970 */ time64_t time_stopped; /* seconds since 1-Jan-1970 */ atomic64_t num_write_errors; atomic64_t num_uncorrectable_read_errors; u64 cursor_left; u64 committed_cursor_left; u64 cursor_left_last_write_of_item; u64 cursor_right; u64 cont_reading_from_srcdev_mode; /* see #define above */ int is_valid; int item_needs_writeback; struct btrfs_device *srcdev; struct btrfs_device *tgtdev; struct mutex lock_finishing_cancel_unmount; struct rw_semaphore rwsem; struct btrfs_scrub_progress scrub_progress; struct percpu_counter bio_counter; wait_queue_head_t replace_wait; }; /* * free clusters are used to claim free space in relatively large chunks, * allowing us to do less seeky writes. They are used for all metadata * allocations. In ssd_spread mode they are also used for data allocations. */ struct btrfs_free_cluster { spinlock_t lock; spinlock_t refill_lock; struct rb_root root; /* largest extent in this cluster */ u64 max_size; /* first extent starting offset */ u64 window_start; /* We did a full search and couldn't create a cluster */ bool fragmented; struct btrfs_block_group *block_group; /* * when a cluster is allocated from a block group, we put the * cluster onto a list in the block group so that it can * be freed before the block group is freed. */ struct list_head block_group_list; }; enum btrfs_caching_type { BTRFS_CACHE_NO, BTRFS_CACHE_STARTED, BTRFS_CACHE_FINISHED, BTRFS_CACHE_ERROR, }; /* * Tree to record all locked full stripes of a RAID5/6 block group */ struct btrfs_full_stripe_locks_tree { struct rb_root root; struct mutex lock; }; /* Discard control. */ /* * Async discard uses multiple lists to differentiate the discard filter * parameters. Index 0 is for completely free block groups where we need to * ensure the entire block group is trimmed without being lossy. Indices * afterwards represent monotonically decreasing discard filter sizes to * prioritize what should be discarded next. */ #define BTRFS_NR_DISCARD_LISTS 3 #define BTRFS_DISCARD_INDEX_UNUSED 0 #define BTRFS_DISCARD_INDEX_START 1 struct btrfs_discard_ctl { struct workqueue_struct *discard_workers; struct delayed_work work; spinlock_t lock; struct btrfs_block_group *block_group; struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; u64 prev_discard; u64 prev_discard_time; atomic_t discardable_extents; atomic64_t discardable_bytes; u64 max_discard_size; u64 delay_ms; u32 iops_limit; u32 kbps_limit; u64 discard_extent_bytes; u64 discard_bitmap_bytes; atomic64_t discard_bytes_saved; }; enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, }; void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); /* fs_info */ struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; struct btrfs_balance_control; struct btrfs_delayed_root; /* * Block group or device which contains an active swapfile. Used for preventing * unsafe operations while a swapfile is active. * * These are sorted on (ptr, inode) (note that a block group or device can * contain more than one swapfile). We compare the pointer values because we * don't actually care what the object is, we just need a quick check whether * the object exists in the rbtree. */ struct btrfs_swapfile_pin { struct rb_node node; void *ptr; struct inode *inode; /* * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr * points to a struct btrfs_device. */ bool is_block_group; /* * Only used when 'is_block_group' is true and it is the number of * extents used by a swapfile for this block group ('ptr' field). */ int bg_extent_count; }; enum { BTRFS_FS_BARRIER, BTRFS_FS_CLOSING_START, BTRFS_FS_CLOSING_DONE, BTRFS_FS_LOG_RECOVERING, BTRFS_FS_OPEN, BTRFS_FS_QUOTA_ENABLED, BTRFS_FS_UPDATE_UUID_TREE_GEN, BTRFS_FS_CREATING_FREE_SPACE_TREE, BTRFS_FS_BTREE_ERR, BTRFS_FS_LOG1_ERR, BTRFS_FS_LOG2_ERR, BTRFS_FS_QUOTA_OVERRIDE, /* Used to record internally whether fs has been frozen */ BTRFS_FS_FROZEN, /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. */ BTRFS_FS_BALANCE_RUNNING, /* * Indicate that relocation of a chunk has started, it's set per chunk * and is toggled between chunks. */ BTRFS_FS_RELOC_RUNNING, /* Indicate that the cleaner thread is awake and doing something. */ BTRFS_FS_CLEANER_RUNNING, /* * The checksumming has an optimized version and is considered fast, * so we don't need to offload checksums to workqueues. */ BTRFS_FS_CSUM_IMPL_FAST, /* Indicate that the discard workqueue can service discards. */ BTRFS_FS_DISCARD_RUNNING, /* Indicate that we need to cleanup space cache v1 */ BTRFS_FS_CLEANUP_SPACE_CACHE_V1, /* Indicate that we can't trust the free space tree for caching yet */ BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, /* Indicate whether there are any tree modification log users */ BTRFS_FS_TREE_MOD_LOG_USERS, /* Indicate we have half completed snapshot deletions pending. */ BTRFS_FS_UNFINISHED_DROPS, #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, BTRFS_FS_32BIT_WARN, #endif }; /* * Exclusive operations (device replace, resize, device add/remove, balance) */ enum btrfs_exclusive_operation { BTRFS_EXCLOP_NONE, BTRFS_EXCLOP_BALANCE, BTRFS_EXCLOP_DEV_ADD, BTRFS_EXCLOP_DEV_REMOVE, BTRFS_EXCLOP_DEV_REPLACE, BTRFS_EXCLOP_RESIZE, BTRFS_EXCLOP_SWAP_ACTIVATE, }; struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; struct btrfs_root *extent_root; struct btrfs_root *tree_root; struct btrfs_root *chunk_root; struct btrfs_root *dev_root; struct btrfs_root *fs_root; struct btrfs_root *csum_root; struct btrfs_root *quota_root; struct btrfs_root *uuid_root; struct btrfs_root *free_space_root; struct btrfs_root *data_reloc_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; spinlock_t fs_roots_radix_lock; struct radix_tree_root fs_roots_radix; /* block group cache stuff */ spinlock_t block_group_cache_lock; u64 first_logical_byte; struct rb_root block_group_cache_tree; /* keep track of unallocated space */ atomic64_t free_chunk_space; /* Track ranges which are used by log trees blocks/logged data extents */ struct extent_io_tree excluded_extents; /* logical->physical extent mapping */ struct extent_map_tree mapping_tree; /* * block reservation for extent, checksum, root tree and * delayed dir index item */ struct btrfs_block_rsv global_block_rsv; /* block reservation for metadata operations */ struct btrfs_block_rsv trans_block_rsv; /* block reservation for chunk tree */ struct btrfs_block_rsv chunk_block_rsv; /* block reservation for delayed operations */ struct btrfs_block_rsv delayed_block_rsv; /* block reservation for delayed refs */ struct btrfs_block_rsv delayed_refs_rsv; struct btrfs_block_rsv empty_block_rsv; u64 generation; u64 last_trans_committed; /* * Generation of the last transaction used for block group relocation * since the filesystem was last mounted (or 0 if none happened yet). * Must be written and read while holding btrfs_fs_info::commit_root_sem. */ u64 last_reloc_trans; u64 avg_delayed_ref_runtime; /* * this is updated to the current trans every time a full commit * is required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; unsigned long mount_opt; /* * Track requests for actions that need to be done during transaction * commit (like for some mount options). */ unsigned long pending_changes; unsigned long compress_type:4; unsigned int compress_level; u32 commit_interval; /* * It is a suggestive number, the read side is safe even it gets a * wrong number because we will write out the data into a regular * extent. The write side(mount/remount) is under ->s_umount lock, * so it is also safe. */ u64 max_inline; struct btrfs_transaction *running_transaction; wait_queue_head_t transaction_throttle; wait_queue_head_t transaction_wait; wait_queue_head_t transaction_blocked_wait; wait_queue_head_t async_submit_wait; /* * Used to protect the incompat_flags, compat_flags, compat_ro_flags * when they are updated. * * Because we do not clear the flags for ever, so we needn't use * the lock on the read side. * * We also needn't use the lock when we mount the fs, because * there is no other task which will update the flag. */ spinlock_t super_lock; struct btrfs_super_block *super_copy; struct btrfs_super_block *super_for_commit; struct super_block *sb; struct inode *btree_inode; struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; struct mutex chunk_mutex; /* * this is taken to make sure we don't set block groups ro after * the free space cache has been allocated on them */ struct mutex ro_block_group_mutex; /* this is used during read/modify/write to make sure * no two ios are trying to mod the same stripe at the same * time */ struct btrfs_stripe_hash_table *stripe_hash_table; /* * this protects the ordered operations list only while we are * processing all of the entries on it. This way we make * sure the commit code doesn't find the list temporarily empty * because another function happens to be doing non-waiting preflush * before jumping into the main commit. */ struct mutex ordered_operations_mutex; struct rw_semaphore commit_root_sem; struct rw_semaphore cleanup_work_sem; struct rw_semaphore subvol_sem; spinlock_t trans_lock; /* * the reloc mutex goes with the trans lock, it is taken * during commit to protect us from the relocation code */ struct mutex reloc_mutex; struct list_head trans_list; struct list_head dead_roots; struct list_head caching_block_groups; spinlock_t delayed_iput_lock; struct list_head delayed_iputs; atomic_t nr_delayed_iputs; wait_queue_head_t delayed_iputs_wait; atomic64_t tree_mod_seq; /* this protects tree_mod_log and tree_mod_seq_list */ rwlock_t tree_mod_log_lock; struct rb_root tree_mod_log; struct list_head tree_mod_seq_list; atomic_t async_delalloc_pages; /* * this is used to protect the following list -- ordered_roots. */ spinlock_t ordered_root_lock; /* * all fs/file tree roots in which there are data=ordered extents * pending writeback are added into this list. * * these can span multiple transactions and basically include * every dirty data page that isn't from nodatacow */ struct list_head ordered_roots; struct mutex delalloc_root_mutex; spinlock_t delalloc_root_lock; /* all fs/file tree roots that have delalloc inodes. */ struct list_head delalloc_roots; /* * there is a pool of worker threads for checksumming during writes * and a pool for checksumming after reads. This is because readers * can run with FS locks held, and the writers may be waiting for * those locks. We don't want ordering in the pending list to cause * deadlocks, and so the two are serviced separately. * * A third pool does submit_bio to avoid deadlocking with the other * two */ struct btrfs_workqueue *workers; struct btrfs_workqueue *delalloc_workers; struct btrfs_workqueue *flush_workers; struct btrfs_workqueue *endio_workers; struct btrfs_workqueue *endio_meta_workers; struct btrfs_workqueue *endio_raid56_workers; struct btrfs_workqueue *rmw_workers; struct btrfs_workqueue *endio_meta_write_workers; struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; struct btrfs_workqueue *caching_workers; struct btrfs_workqueue *readahead_workers; /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens * for the sys_munmap function call path */ struct btrfs_workqueue *fixup_workers; struct btrfs_workqueue *delayed_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; u32 thread_pool_size; struct kobject *space_info_kobj; struct kobject *qgroups_kobj; /* used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; struct percpu_counter ordered_bytes; s32 dirty_metadata_batch; s32 delalloc_batch; struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; /* * The space_info list is effectively read only after initial * setup. It is populated at mount time and cleaned up after * all block groups are removed. RCU is used to protect it. */ struct list_head space_info; struct btrfs_space_info *data_sinfo; struct reloc_control *reloc_ctl; /* data_alloc_cluster is only used in ssd_spread mode */ struct btrfs_free_cluster data_alloc_cluster; /* all metadata allocations go through this cluster */ struct btrfs_free_cluster meta_alloc_cluster; /* auto defrag inodes go here */ spinlock_t defrag_inodes_lock; struct rb_root defrag_inodes; atomic_t defrag_running; /* Used to protect avail_{data, metadata, system}_alloc_bits */ seqlock_t profiles_lock; /* * these three are in extended format (availability of single * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits) */ u64 avail_data_alloc_bits; u64 avail_metadata_alloc_bits; u64 avail_system_alloc_bits; /* restriper state */ spinlock_t balance_lock; struct mutex balance_mutex; atomic_t balance_pause_req; atomic_t balance_cancel_req; struct btrfs_balance_control *balance_ctl; wait_queue_head_t balance_wait_q; /* Cancellation requests for chunk relocation */ atomic_t reloc_cancel_req; u32 data_chunk_allocations; u32 metadata_ratio; void *bdev_holder; /* private scrub information */ struct mutex scrub_lock; atomic_t scrubs_running; atomic_t scrub_pause_req; atomic_t scrubs_paused; atomic_t scrub_cancel_req; wait_queue_head_t scrub_pause_wait; /* * The worker pointers are NULL iff the refcount is 0, ie. scrub is not * running. */ refcount_t scrub_workers_refcnt; struct btrfs_workqueue *scrub_workers; struct btrfs_workqueue *scrub_wr_completion_workers; struct btrfs_workqueue *scrub_parity_workers; struct btrfs_discard_ctl discard_ctl; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; #endif /* is qgroup tracking in a consistent state? */ u64 qgroup_flags; /* holds configuration and tracking. Protected by qgroup_lock */ struct rb_root qgroup_tree; spinlock_t qgroup_lock; /* * used to avoid frequently calling ulist_alloc()/ulist_free() * when doing qgroup accounting, it must be protected by qgroup_lock. */ struct ulist *qgroup_ulist; /* * Protect user change for quota operations. If a transaction is needed, * it must be started before locking this lock. */ struct mutex qgroup_ioctl_lock; /* list of dirty qgroups to be written at next commit */ struct list_head dirty_qgroups; /* used by qgroup for an efficient tree traversal */ u64 qgroup_seq; /* qgroup rescan items */ struct mutex qgroup_rescan_lock; /* protects the progress item */ struct btrfs_key qgroup_rescan_progress; struct btrfs_workqueue *qgroup_rescan_workers; struct completion qgroup_rescan_completion; struct btrfs_work qgroup_rescan_work; bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */ /* filesystem state */ unsigned long fs_state; struct btrfs_delayed_root *delayed_root; /* readahead tree */ spinlock_t reada_lock; struct radix_tree_root reada_tree; /* readahead works cnt */ atomic_t reada_works_cnt; /* Extent buffer radix tree */ spinlock_t buffer_lock; /* Entries are eb->start / sectorsize */ struct radix_tree_root buffer_radix; /* next backup root to be overwritten */ int backup_root_index; /* device replace state */ struct btrfs_dev_replace dev_replace; struct semaphore uuid_tree_rescan_sem; /* Used to reclaim the metadata space in the background. */ struct work_struct async_reclaim_work; struct work_struct async_data_reclaim_work; struct work_struct preempt_reclaim_work; /* Reclaim partially filled block groups in the background */ struct work_struct reclaim_bgs_work; struct list_head reclaim_bgs; int bg_reclaim_threshold; spinlock_t unused_bgs_lock; struct list_head unused_bgs; struct mutex unused_bg_unpin_mutex; /* Protect block groups that are going to be deleted */ struct mutex reclaim_bgs_lock; /* Cached block sizes */ u32 nodesize; u32 sectorsize; /* ilog2 of sectorsize, use to avoid 64bit division */ u32 sectorsize_bits; u32 csum_size; u32 csums_per_leaf; u32 stripesize; /* * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular * filesystem, on zoned it depends on the device constraints. */ u64 max_extent_size; /* Block groups and devices containing active swapfiles. */ spinlock_t swapfile_pins_lock; struct rb_root swapfile_pins; struct crypto_shash *csum_shash; /* Type of exclusive operation running, protected by super_lock */ enum btrfs_exclusive_operation exclusive_operation; /* * Zone size > 0 when in ZONED mode, otherwise it's used for a check * if the mode is enabled */ union { u64 zone_size; u64 zoned; }; /* Max size to emit ZONE_APPEND write command */ u64 max_zone_append_size; struct mutex zoned_meta_io_lock; spinlock_t treelog_bg_lock; u64 treelog_bg; /* * Start of the dedicated data relocation block group, protected by * relocation_bg_lock. */ spinlock_t relocation_bg_lock; u64 data_reloc_bg; struct mutex zoned_data_reloc_io_lock; #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; #endif #ifdef CONFIG_BTRFS_DEBUG struct kobject *debug_kobj; struct kobject *discard_debug_kobj; struct list_head allocated_roots; spinlock_t eb_leak_lock; struct list_head allocated_ebs; #endif }; static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; } /* * The state of btrfs root */ enum { /* * btrfs_record_root_in_trans is a multi-step process, and it can race * with the balancing code. But the race is very small, and only the * first time the root is added to each transaction. So IN_TRANS_SETUP * is used to tell us when more checks are required */ BTRFS_ROOT_IN_TRANS_SETUP, /* * Set if tree blocks of this root can be shared by other roots. * Only subvolume trees and their reloc trees have this bit set. * Conflicts with TRACK_DIRTY bit. * * This affects two things: * * - How balance works * For shareable roots, we need to use reloc tree and do path * replacement for balance, and need various pre/post hooks for * snapshot creation to handle them. * * While for non-shareable trees, we just simply do a tree search * with COW. * * - How dirty roots are tracked * For shareable roots, btrfs_record_root_in_trans() is needed to * track them, while non-subvolume roots have TRACK_DIRTY bit, they * don't need to set this manually. */ BTRFS_ROOT_SHAREABLE, BTRFS_ROOT_TRACK_DIRTY, BTRFS_ROOT_IN_RADIX, BTRFS_ROOT_ORPHAN_ITEM_INSERTED, BTRFS_ROOT_DEFRAG_RUNNING, BTRFS_ROOT_FORCE_COW, BTRFS_ROOT_MULTI_LOG_TASKS, BTRFS_ROOT_DIRTY, BTRFS_ROOT_DELETING, /* * Reloc tree is orphan, only kept here for qgroup delayed subtree scan * * Set for the subvolume tree owning the reloc tree. */ BTRFS_ROOT_DEAD_RELOC_TREE, /* Mark dead root stored on device whose cleanup needs to be resumed */ BTRFS_ROOT_DEAD_TREE, /* The root has a log tree. Used for subvolume roots and the tree root. */ BTRFS_ROOT_HAS_LOG_TREE, /* Qgroup flushing is in progress */ BTRFS_ROOT_QGROUP_FLUSHING, /* This root has a drop operation that was started previously. */ BTRFS_ROOT_UNFINISHED_DROP, /* This reloc root needs to have its buffers lockdep class reset. */ BTRFS_ROOT_RESET_LOCKDEP_CLASS, }; static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) { clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); } /* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. */ struct btrfs_qgroup_swapped_blocks { spinlock_t lock; /* RM_EMPTY_ROOT() of above blocks[] */ bool swapped; struct rb_root blocks[BTRFS_MAX_LEVEL]; }; /* * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. */ struct btrfs_root { struct extent_buffer *node; struct extent_buffer *commit_root; struct btrfs_root *log_root; struct btrfs_root *reloc_root; unsigned long state; struct btrfs_root_item root_item; struct btrfs_key root_key; struct btrfs_fs_info *fs_info; struct extent_io_tree dirty_log_pages; struct mutex objectid_mutex; spinlock_t accounting_lock; struct btrfs_block_rsv *block_rsv; struct mutex log_mutex; wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; struct list_head log_ctxs[2]; /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_writers; atomic_t log_commit[2]; /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_batch; int log_transid; /* No matter the commit succeeds or not*/ int log_transid_committed; /* Just be updated when the commit succeeds. */ int last_log_commit; pid_t log_start_pid; u64 last_trans; u32 type; u64 free_objectid; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; /* The dirty list is only used by non-shareable roots */ struct list_head dirty_list; struct list_head root_list; spinlock_t log_extents_lock[2]; struct list_head logged_list[2]; int orphan_cleanup_state; spinlock_t inode_lock; /* red-black tree that keeps track of in-memory inodes */ struct rb_root inode_tree; /* * radix tree that keeps track of delayed nodes of every inode, * protected by inode_lock */ struct radix_tree_root delayed_nodes_tree; /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later */ dev_t anon_dev; spinlock_t root_item_lock; refcount_t refs; struct mutex delalloc_mutex; spinlock_t delalloc_lock; /* * all of the inodes that have delalloc bytes. It is possible for * this list to be empty even when there is still dirty data=ordered * extents waiting to finish IO. */ struct list_head delalloc_inodes; struct list_head delalloc_root; u64 nr_delalloc_inodes; struct mutex ordered_extent_mutex; /* * this is used by the balancing code to wait for all the pending * ordered extents */ spinlock_t ordered_extent_lock; /* * all of the data=ordered extents pending writeback * these can span multiple transactions and basically include * every dirty data page that isn't from nodatacow */ struct list_head ordered_extents; struct list_head ordered_root; u64 nr_ordered_extents; /* * Not empty if this subvolume root has gone through tree block swap * (relocation) * * Will be used by reloc_control::dirty_subvol_roots. */ struct list_head reloc_dirty_list; /* * Number of currently running SEND ioctls to prevent * manipulation with the read-only status via SUBVOL_SETFLAGS */ int send_in_progress; /* * Number of currently running deduplication operations that have a * destination inode belonging to this root. Protected by the lock * root_item_lock. */ int dedupe_in_progress; /* For exclusion of snapshot creation and nocow writes */ struct btrfs_drew_lock snapshot_lock; atomic_t snapshot_force_cow; /* For qgroup metadata reserved space */ spinlock_t qgroup_meta_rsv_lock; u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc; wait_queue_head_t qgroup_flush_wait; /* Number of active swapfiles */ atomic_t nr_swapfiles; /* Record pairs of swapped blocks for qgroup */ struct btrfs_qgroup_swapped_blocks swapped_blocks; /* Used only by log trees, when logging csum items */ struct extent_io_tree log_csum_range; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS u64 alloc_bytenr; #endif #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; #endif }; /* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. */ struct btrfs_replace_extent_info { u64 disk_offset; u64 disk_len; u64 data_offset; u64 data_len; u64 file_offset; /* Pointer to a file extent item of type regular or prealloc. */ char *extent_buf; /* * Set to true when attempting to replace a file range with a new extent * described by this structure, set to false when attempting to clone an * existing extent into a file range. */ bool is_new_extent; /* Meaningful only if is_new_extent is true. */ int qgroup_reserved; /* * Meaningful only if is_new_extent is true. * Used to track how many extent items we have already inserted in a * subvolume tree that refer to the extent described by this structure, * so that we know when to create a new delayed ref or update an existing * one. */ int insertions; }; /* Arguments for btrfs_drop_extents() */ struct btrfs_drop_extents_args { /* Input parameters */ /* * If NULL, btrfs_drop_extents() will allocate and free its own path. * If 'replace_extent' is true, this must not be NULL. Also the path * is always released except if 'replace_extent' is true and * btrfs_drop_extents() sets 'extent_inserted' to true, in which case * the path is kept locked. */ struct btrfs_path *path; /* Start offset of the range to drop extents from */ u64 start; /* End (exclusive, last byte + 1) of the range to drop extents from */ u64 end; /* If true drop all the extent maps in the range */ bool drop_cache; /* * If true it means we want to insert a new extent after dropping all * the extents in the range. If this is true, the 'extent_item_size' * parameter must be set as well and the 'extent_inserted' field will * be set to true by btrfs_drop_extents() if it could insert the new * extent. * Note: when this is set to true the path must not be NULL. */ bool replace_extent; /* * Used if 'replace_extent' is true. Size of the file extent item to * insert after dropping all existing extents in the range */ u32 extent_item_size; /* Output parameters */ /* * Set to the minimum between the input parameter 'end' and the end * (exclusive, last byte + 1) of the last dropped extent. This is always * set even if btrfs_drop_extents() returns an error. */ u64 drop_end; /* * The number of allocated bytes found in the range. This can be smaller * than the range's length when there are holes in the range. */ u64 bytes_found; /* * Only set if 'replace_extent' is true. Set to true if we were able * to insert a replacement extent after dropping all extents in the * range, otherwise set to false by btrfs_drop_extents(). * Also, if btrfs_drop_extents() has set this to true it means it * returned with the path locked, otherwise if it has set this to * false it has returned with the path released. */ bool extent_inserted; }; struct btrfs_file_private { void *filldir_buf; u64 last_index; }; static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { return info->nodesize - sizeof(struct btrfs_header); } #define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items) static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info) { return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item); } static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info) { return BTRFS_LEAF_DATA_SIZE(info) / sizeof(struct btrfs_key_ptr); } #define BTRFS_FILE_EXTENT_INLINE_DATA_START \ (offsetof(struct btrfs_file_extent_item, disk_bytenr)) static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info) { return BTRFS_MAX_ITEM_SIZE(info) - BTRFS_FILE_EXTENT_INLINE_DATA_START; } static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) { return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } /* * Flags for mount options. * * Note: don't forget to add new options to btrfs_show_options() */ enum { BTRFS_MOUNT_NODATASUM = (1UL << 0), BTRFS_MOUNT_NODATACOW = (1UL << 1), BTRFS_MOUNT_NOBARRIER = (1UL << 2), BTRFS_MOUNT_SSD = (1UL << 3), BTRFS_MOUNT_DEGRADED = (1UL << 4), BTRFS_MOUNT_COMPRESS = (1UL << 5), BTRFS_MOUNT_NOTREELOG = (1UL << 6), BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7), BTRFS_MOUNT_SSD_SPREAD = (1UL << 8), BTRFS_MOUNT_NOSSD = (1UL << 9), BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10), BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11), BTRFS_MOUNT_SPACE_CACHE = (1UL << 12), BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13), BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14), BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15), BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), BTRFS_MOUNT_REF_VERIFY = (1UL << 27), BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), }; #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) #define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ BTRFS_MOUNT_##opt) #define btrfs_set_and_info(fs_info, opt, fmt, args...) \ do { \ if (!btrfs_test_opt(fs_info, opt)) \ btrfs_info(fs_info, fmt, ##args); \ btrfs_set_opt(fs_info->mount_opt, opt); \ } while (0) #define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ do { \ if (btrfs_test_opt(fs_info, opt)) \ btrfs_info(fs_info, fmt, ##args); \ btrfs_clear_opt(fs_info->mount_opt, opt); \ } while (0) /* * Requests for changes that need to be done during transaction commit. * * Internal mount options that are used for special handling of the real * mount options (eg. cannot be set during remount and have to be set during * transaction commit) */ #define BTRFS_PENDING_COMMIT (0) #define btrfs_test_pending(info, opt) \ test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) #define btrfs_set_pending(info, opt) \ set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) #define btrfs_clear_pending(info, opt) \ clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) /* * Helpers for setting pending mount option changes. * * Expects corresponding macros * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name */ #define btrfs_set_pending_and_info(info, opt, fmt, args...) \ do { \ if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \ btrfs_info((info), fmt, ##args); \ btrfs_set_pending((info), SET_##opt); \ btrfs_clear_pending((info), CLEAR_##opt); \ } \ } while(0) #define btrfs_clear_pending_and_info(info, opt, fmt, args...) \ do { \ if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \ btrfs_info((info), fmt, ##args); \ btrfs_set_pending((info), CLEAR_##opt); \ btrfs_clear_pending((info), SET_##opt); \ } \ } while(0) /* * Inode flags */ #define BTRFS_INODE_NODATASUM (1U << 0) #define BTRFS_INODE_NODATACOW (1U << 1) #define BTRFS_INODE_READONLY (1U << 2) #define BTRFS_INODE_NOCOMPRESS (1U << 3) #define BTRFS_INODE_PREALLOC (1U << 4) #define BTRFS_INODE_SYNC (1U << 5) #define BTRFS_INODE_IMMUTABLE (1U << 6) #define BTRFS_INODE_APPEND (1U << 7) #define BTRFS_INODE_NODUMP (1U << 8) #define BTRFS_INODE_NOATIME (1U << 9) #define BTRFS_INODE_DIRSYNC (1U << 10) #define BTRFS_INODE_COMPRESS (1U << 11) #define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31) #define BTRFS_INODE_FLAG_MASK \ (BTRFS_INODE_NODATASUM | \ BTRFS_INODE_NODATACOW | \ BTRFS_INODE_READONLY | \ BTRFS_INODE_NOCOMPRESS | \ BTRFS_INODE_PREALLOC | \ BTRFS_INODE_SYNC | \ BTRFS_INODE_IMMUTABLE | \ BTRFS_INODE_APPEND | \ BTRFS_INODE_NODUMP | \ BTRFS_INODE_NOATIME | \ BTRFS_INODE_DIRSYNC | \ BTRFS_INODE_COMPRESS | \ BTRFS_INODE_ROOT_ITEM_INIT) #define BTRFS_INODE_RO_VERITY (1U << 0) #define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY) struct btrfs_map_token { struct extent_buffer *eb; char *kaddr; unsigned long offset; }; #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sectorsize_bits) static inline void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb) { token->eb = eb; token->kaddr = page_address(eb->pages[0]); token->offset = 0; } /* some macros to generate set/get functions for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: */ #define le8_to_cpu(v) (v) #define cpu_to_le8(v) (v) #define __le8 u8 static inline u8 get_unaligned_le8(const void *p) { return *(u8 *)p; } static inline void put_unaligned_le8(u8 val, void *p) { *(u8 *)p = val; } #define read_eb_member(eb, ptr, type, member, result) (\ read_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ offsetof(type, member), \ sizeof(((type *)0)->member))) #define write_eb_member(eb, ptr, type, member, result) (\ write_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ offsetof(type, member), \ sizeof(((type *)0)->member))) #define DECLARE_BTRFS_SETGET_BITS(bits) \ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ const void *ptr, unsigned long off); \ void btrfs_set_token_##bits(struct btrfs_map_token *token, \ const void *ptr, unsigned long off, \ u##bits val); \ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off); \ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ unsigned long off, u##bits val); DECLARE_BTRFS_SETGET_BITS(8) DECLARE_BTRFS_SETGET_BITS(16) DECLARE_BTRFS_SETGET_BITS(32) DECLARE_BTRFS_SETGET_BITS(64) #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ const type *s) \ { \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ return btrfs_get_##bits(eb, s, offsetof(type, member)); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ u##bits val) \ { \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ } \ static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ const type *s) \ { \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ return btrfs_get_token_##bits(token, s, offsetof(type, member));\ } \ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ type *s, u##bits val) \ { \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ } #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ { \ const type *p = page_address(eb->pages[0]) + \ offset_in_page(eb->start); \ return get_unaligned_le##bits(&p->member); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, \ u##bits val) \ { \ type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \ put_unaligned_le##bits(val, &p->member); \ } #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const type *s) \ { \ return get_unaligned_le##bits(&s->member); \ } \ static inline void btrfs_set_##name(type *s, u##bits val) \ { \ put_unaligned_le##bits(val, &s->member); \ } static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { BUILD_BUG_ON(sizeof(u64) != sizeof(((struct btrfs_dev_item *)0))->total_bytes); return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes)); } static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s, u64 val) { BUILD_BUG_ON(sizeof(u64) != sizeof(((struct btrfs_dev_item *)0))->total_bytes); WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); } BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, start_offset, 64); BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, total_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, io_align, 32); BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, io_width, 32); BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, sector_size, 32); BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, dev_group, 32); BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, seek_speed, 8); BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, bandwidth, 8); BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, generation, 64); static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) { return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); } static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) { return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); } BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) { return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); } BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, io_align, 32); BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, io_width, 32); BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, sector_size, 32); BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, int nr) { unsigned long offset = (unsigned long)c; offset += offsetof(struct btrfs_chunk, stripe); offset += nr * sizeof(struct btrfs_stripe); return (struct btrfs_stripe *)offset; } static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) { return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); } static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, struct btrfs_chunk *c, int nr) { return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); } static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, struct btrfs_chunk *c, int nr) { return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); } /* struct btrfs_block_group_item */ BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, used, 64); BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64); BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid, struct btrfs_block_group_item, chunk_objectid, 64); BTRFS_SETGET_FUNCS(block_group_chunk_objectid, struct btrfs_block_group_item, chunk_objectid, 64); BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, struct btrfs_block_group_item, flags, 64); /* struct btrfs_free_space_info */ BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, extent_count, 32); BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); /* struct btrfs_inode_ref */ BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); /* struct btrfs_inode_extref */ BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, parent_objectid, 64); BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, name_len, 16); BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); /* struct btrfs_inode_item */ BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, sequence, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, transid, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, nbytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, block_group, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); /* struct btrfs_dev_extent */ BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64); BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, chunk_objectid, 64); BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, chunk_offset, 64); BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64); BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); static inline void btrfs_tree_block_key(const struct extent_buffer *eb, struct btrfs_tree_block_info *item, struct btrfs_disk_key *key) { read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); } static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb, struct btrfs_tree_block_info *item, struct btrfs_disk_key *key) { write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); } BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, root, 64); BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, objectid, 64); BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, offset, 64); BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 32); BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32); BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, type, 8); BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, offset, 64); static inline u32 btrfs_extent_inline_ref_size(int type) { if (type == BTRFS_TREE_BLOCK_REF_KEY || type == BTRFS_SHARED_BLOCK_REF_KEY) return sizeof(struct btrfs_extent_inline_ref); if (type == BTRFS_SHARED_DATA_REF_KEY) return sizeof(struct btrfs_shared_data_ref) + sizeof(struct btrfs_extent_inline_ref); if (type == BTRFS_EXTENT_DATA_REF_KEY) return sizeof(struct btrfs_extent_data_ref) + offsetof(struct btrfs_extent_inline_ref, offset); return 0; } /* struct btrfs_node */ BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, blockptr, 64); BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, generation, 64); static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); } static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb, int nr, u64 val) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); } static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); } static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, int nr, u64 val) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); } static inline unsigned long btrfs_node_key_ptr_offset(int nr) { return offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; } void btrfs_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr); static inline void btrfs_set_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { unsigned long ptr; ptr = btrfs_node_key_ptr_offset(nr); write_eb_member(eb, (struct btrfs_key_ptr *)ptr, struct btrfs_key_ptr, key, disk_key); } /* struct btrfs_item */ BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); static inline unsigned long btrfs_item_nr_offset(int nr) { return offsetof(struct btrfs_leaf, items) + sizeof(struct btrfs_item) * nr; } static inline struct btrfs_item *btrfs_item_nr(int nr) { return (struct btrfs_item *)btrfs_item_nr_offset(nr); } static inline u32 btrfs_item_end(const struct extent_buffer *eb, struct btrfs_item *item) { return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); } static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr) { return btrfs_item_end(eb, btrfs_item_nr(nr)); } static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr) { return btrfs_item_offset(eb, btrfs_item_nr(nr)); } static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr) { return btrfs_item_size(eb, btrfs_item_nr(nr)); } static inline void btrfs_item_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { struct btrfs_item *item = btrfs_item_nr(nr); read_eb_member(eb, item, struct btrfs_item, key, disk_key); } static inline void btrfs_set_item_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { struct btrfs_item *item = btrfs_item_nr(nr); write_eb_member(eb, item, struct btrfs_item, key, disk_key); } BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); /* * struct btrfs_root_ref */ BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); /* struct btrfs_dir_item */ BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8); BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, data_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, name_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64); static inline void btrfs_dir_item_key(const struct extent_buffer *eb, const struct btrfs_dir_item *item, struct btrfs_disk_key *key) { read_eb_member(eb, item, struct btrfs_dir_item, location, key); } static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, struct btrfs_dir_item *item, const struct btrfs_disk_key *key) { write_eb_member(eb, item, struct btrfs_dir_item, location, key); } BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, num_entries, 64); BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, num_bitmaps, 64); BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, generation, 64); static inline void btrfs_free_space_key(const struct extent_buffer *eb, const struct btrfs_free_space_header *h, struct btrfs_disk_key *key) { read_eb_member(eb, h, struct btrfs_free_space_header, location, key); } static inline void btrfs_set_free_space_key(struct extent_buffer *eb, struct btrfs_free_space_header *h, const struct btrfs_disk_key *key) { write_eb_member(eb, h, struct btrfs_free_space_header, location, key); } /* struct btrfs_disk_key */ BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, objectid, 64); BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); #ifdef __LITTLE_ENDIAN /* * Optimized helpers for little-endian architectures where CPU and on-disk * structures have the same endianness and we can skip conversions. */ static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key, const struct btrfs_disk_key *disk_key) { memcpy(cpu_key, disk_key, sizeof(struct btrfs_key)); } static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key, const struct btrfs_key *cpu_key) { memcpy(disk_key, cpu_key, sizeof(struct btrfs_key)); } static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, struct btrfs_key *cpu_key, int nr) { struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; btrfs_node_key(eb, disk_key, nr); } static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, struct btrfs_key *cpu_key, int nr) { struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; btrfs_item_key(eb, disk_key, nr); } static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, const struct btrfs_dir_item *item, struct btrfs_key *cpu_key) { struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; btrfs_dir_item_key(eb, item, disk_key); } #else static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, const struct btrfs_disk_key *disk) { cpu->offset = le64_to_cpu(disk->offset); cpu->type = disk->type; cpu->objectid = le64_to_cpu(disk->objectid); } static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, const struct btrfs_key *cpu) { disk->offset = cpu_to_le64(cpu->offset); disk->type = cpu->type; disk->objectid = cpu_to_le64(cpu->objectid); } static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, struct btrfs_key *key, int nr) { struct btrfs_disk_key disk_key; btrfs_node_key(eb, &disk_key, nr); btrfs_disk_key_to_cpu(key, &disk_key); } static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, struct btrfs_key *key, int nr) { struct btrfs_disk_key disk_key; btrfs_item_key(eb, &disk_key, nr); btrfs_disk_key_to_cpu(key, &disk_key); } static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, const struct btrfs_dir_item *item, struct btrfs_key *key) { struct btrfs_disk_key disk_key; btrfs_dir_item_key(eb, item, &disk_key); btrfs_disk_key_to_cpu(key, &disk_key); } #endif /* struct btrfs_header */ BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, generation, 64); BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, nritems, 32); BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) { return (btrfs_header_flags(eb) & flag) == flag; } static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) { u64 flags = btrfs_header_flags(eb); btrfs_set_header_flags(eb, flags | flag); } static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) { u64 flags = btrfs_header_flags(eb); btrfs_set_header_flags(eb, flags & ~flag); } static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) { u64 flags = btrfs_header_flags(eb); return flags >> BTRFS_BACKREF_REV_SHIFT; } static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, int rev) { u64 flags = btrfs_header_flags(eb); flags &= ~BTRFS_BACKREF_REV_MASK; flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; btrfs_set_header_flags(eb, flags); } static inline int btrfs_is_leaf(const struct extent_buffer *eb) { return btrfs_header_level(eb) == 0; } /* struct btrfs_root_item */ BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, generation, 64); BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8); BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, last_snapshot, 64); BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, generation_v2, 64); BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, ctransid, 64); BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, otransid, 64); BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, stransid, 64); BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, rtransid, 64); static inline bool btrfs_root_readonly(const struct btrfs_root *root) { /* Byte-swap the constant at compile time, root_item::flags is LE */ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; } static inline bool btrfs_root_dead(const struct btrfs_root *root) { /* Byte-swap the constant at compile time, root_item::flags is LE */ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; } static inline u64 btrfs_root_id(const struct btrfs_root *root) { return root->root_key.objectid; } /* struct btrfs_root_backup */ BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, tree_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, tree_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, tree_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, chunk_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, chunk_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, chunk_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, extent_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, extent_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, extent_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, fs_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, fs_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, fs_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, dev_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, dev_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, dev_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, csum_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, csum_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, csum_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, total_bytes, 64); BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, num_devices, 64); /* struct btrfs_balance_item */ BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); static inline void btrfs_balance_data(const struct extent_buffer *eb, const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); } static inline void btrfs_set_balance_data(struct extent_buffer *eb, struct btrfs_balance_item *bi, const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); } static inline void btrfs_balance_meta(const struct extent_buffer *eb, const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); } static inline void btrfs_set_balance_meta(struct extent_buffer *eb, struct btrfs_balance_item *bi, const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); } static inline void btrfs_balance_sys(const struct extent_buffer *eb, const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); } static inline void btrfs_set_balance_sys(struct extent_buffer *eb, struct btrfs_balance_item *bi, const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); } static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, const struct btrfs_disk_balance_args *disk) { memset(cpu, 0, sizeof(*cpu)); cpu->profiles = le64_to_cpu(disk->profiles); cpu->usage = le64_to_cpu(disk->usage); cpu->devid = le64_to_cpu(disk->devid); cpu->pstart = le64_to_cpu(disk->pstart); cpu->pend = le64_to_cpu(disk->pend); cpu->vstart = le64_to_cpu(disk->vstart); cpu->vend = le64_to_cpu(disk->vend); cpu->target = le64_to_cpu(disk->target); cpu->flags = le64_to_cpu(disk->flags); cpu->limit = le64_to_cpu(disk->limit); cpu->stripes_min = le32_to_cpu(disk->stripes_min); cpu->stripes_max = le32_to_cpu(disk->stripes_max); } static inline void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, const struct btrfs_balance_args *cpu) { memset(disk, 0, sizeof(*disk)); disk->profiles = cpu_to_le64(cpu->profiles); disk->usage = cpu_to_le64(cpu->usage); disk->devid = cpu_to_le64(cpu->devid); disk->pstart = cpu_to_le64(cpu->pstart); disk->pend = cpu_to_le64(cpu->pend); disk->vstart = cpu_to_le64(cpu->vstart); disk->vend = cpu_to_le64(cpu->vend); disk->target = cpu_to_le64(cpu->target); disk->flags = cpu_to_le64(cpu->flags); disk->limit = cpu_to_le64(cpu->limit); disk->stripes_min = cpu_to_le32(cpu->stripes_min); disk->stripes_max = cpu_to_le32(cpu->stripes_max); } /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, generation, 64); BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, struct btrfs_super_block, sys_chunk_array_size, 32); BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, struct btrfs_super_block, chunk_root_generation, 64); BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, chunk_root, 64); BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, chunk_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64); BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, log_root_transid, 64); BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, log_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, total_bytes, 64); BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, sectorsize, 32); BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, nodesize, 32); BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, stripesize, 32); BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, root_dir_objectid, 64); BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, num_devices, 64); BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, compat_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, compat_ro_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, incompat_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, csum_type, 16); BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, cache_generation, 64); BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); const char *btrfs_super_csum_driver(u16 csum_type); size_t __attribute_const__ btrfs_get_num_csums(void); /* * The leaf data grows from end-to-front in the node. * this returns the address of the start of the last item, * which is the stop of the leaf data stack */ static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) { u32 nr = btrfs_header_nritems(leaf); if (nr == 0) return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); return btrfs_item_offset_nr(leaf, nr - 1); } /* struct btrfs_file_extent_item */ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, type, 8); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, struct btrfs_file_extent_item, disk_bytenr, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, struct btrfs_file_extent_item, offset, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, struct btrfs_file_extent_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, struct btrfs_file_extent_item, num_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes, struct btrfs_file_extent_item, ram_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, struct btrfs_file_extent_item, disk_num_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, struct btrfs_file_extent_item, compression, 8); static inline unsigned long btrfs_file_extent_inline_start(const struct btrfs_file_extent_item *e) { return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; } static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) { return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; } BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, disk_bytenr, 64); BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, generation, 64); BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, disk_num_bytes, 64); BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, offset, 64); BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, num_bytes, 64); BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, ram_bytes, 64); BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, compression, 8); BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, encryption, 8); BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, other_encoding, 16); /* * this returns the number of bytes used by the item on disk, minus the * size of any extent headers. If a file is compressed on disk, this is * the compressed size */ static inline u32 btrfs_file_extent_inline_item_len( const struct extent_buffer *eb, struct btrfs_item *e) { return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START; } /* btrfs_qgroup_status_item */ BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, generation, 64); BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, version, 64); BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, flags, 64); BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, rescan, 64); /* btrfs_qgroup_info_item */ BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, generation, 64); BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, rfer_cmpr, 64); BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, excl_cmpr, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, struct btrfs_qgroup_info_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, rfer_cmpr, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, excl_cmpr, 64); /* btrfs_qgroup_limit_item */ BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, flags, 64); BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, max_rfer, 64); BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, max_excl, 64); BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, rsv_rfer, 64); BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, rsv_excl, 64); /* btrfs_dev_replace_item */ BTRFS_SETGET_FUNCS(dev_replace_src_devid, struct btrfs_dev_replace_item, src_devid, 64); BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, 64); BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, replace_state, 64); BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, time_started, 64); BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, time_stopped, 64); BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, num_write_errors, 64); BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, struct btrfs_dev_replace_item, num_uncorrectable_read_errors, 64); BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, cursor_left, 64); BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, cursor_right, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, struct btrfs_dev_replace_item, src_devid, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, struct btrfs_dev_replace_item, replace_state, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, struct btrfs_dev_replace_item, time_started, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, struct btrfs_dev_replace_item, time_stopped, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, struct btrfs_dev_replace_item, num_write_errors, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, struct btrfs_dev_replace_item, num_uncorrectable_read_errors, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, struct btrfs_dev_replace_item, cursor_left, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, struct btrfs_dev_replace_item, cursor_right, 64); /* helper function to cast into the data area of the leaf. */ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) #define btrfs_item_ptr_offset(leaf, slot) \ ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) { return crc32c(crc, address, length); } static inline void btrfs_crc32c_final(u32 crc, u8 *result) { put_unaligned_le32(~crc, result); } static inline u64 btrfs_name_hash(const char *name, int len) { return crc32c((u32)~1, name, len); } /* * Figure the key offset of an extended inode ref */ static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, int len) { return (u64) crc32c(parent_objectid, name, len); } static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) { return mapping_gfp_constraint(mapping, ~__GFP_FS); } /* extent-tree.c */ enum btrfs_inline_ref_type { BTRFS_REF_TYPE_INVALID, BTRFS_REF_TYPE_BLOCK, BTRFS_REF_TYPE_DATA, BTRFS_REF_TYPE_ANY, }; int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); /* * Take the number of bytes to be checksummmed and figure out how many leaves * it would require to store the csums for that many bytes. */ static inline u64 btrfs_csum_bytes_to_leaves( const struct btrfs_fs_info *fs_info, u64 csum_bytes) { const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); } /* * Use this if we would be adding new items, as we could split nodes as we cow * down the tree. */ static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, unsigned num_items) { return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; } /* * Doing a truncate or a modification won't result in new nodes or leaves, just * what we need for COW. */ static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, unsigned num_items) { return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, u64 start, u64 num_bytes); void btrfs_free_excluded_extents(struct btrfs_block_group *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, int reserved); int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr, bool strict); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size, enum btrfs_lock_nesting nest); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 root_id, struct extent_buffer *buf, u64 parent, int last_ref); int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 owner, u64 offset, u64 ram_bytes, struct btrfs_key *ins); int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset, struct btrfs_key *ins); int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags, int level, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc); int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); /* * Different levels for to flush space when doing space reservations. * * The higher the level, the more methods we try to reclaim space. */ enum btrfs_reserve_flush_enum { /* If we are in the transaction, we can't flush anything.*/ BTRFS_RESERVE_NO_FLUSH, /* * Flush space by: * - Running delayed inode items * - Allocating a new chunk */ BTRFS_RESERVE_FLUSH_LIMIT, /* * Flush space by: * - Running delayed inode items * - Running delayed refs * - Running delalloc and waiting for ordered extents * - Allocating a new chunk */ BTRFS_RESERVE_FLUSH_EVICT, /* * Flush space by above mentioned methods and by: * - Running delayed iputs * - Committing transaction * * Can be interrupted by a fatal signal. */ BTRFS_RESERVE_FLUSH_DATA, BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, BTRFS_RESERVE_FLUSH_ALL, /* * Pretty much the same as FLUSH_ALL, but can also steal space from * global rsv. * * Can be interrupted by a fatal signal. */ BTRFS_RESERVE_FLUSH_ALL_STEAL, }; enum btrfs_flush_state { FLUSH_DELAYED_ITEMS_NR = 1, FLUSH_DELAYED_ITEMS = 2, FLUSH_DELAYED_REFS_NR = 3, FLUSH_DELAYED_REFS = 4, FLUSH_DELALLOC = 5, FLUSH_DELALLOC_WAIT = 6, FLUSH_DELALLOC_FULL = 7, ALLOC_CHUNK = 8, ALLOC_CHUNK_FORCE = 9, RUN_DELAYED_IPUTS = 10, COMMIT_TRANS = 11, }; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_start_write_no_snapshotting(struct btrfs_root *root); void btrfs_end_write_no_snapshotting(struct btrfs_root *root); void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, u64 min_trans); int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_path *path, u64 min_trans); struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, int slot); int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, enum btrfs_lock_nesting nest); int btrfs_force_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, u64 search_start, u64 empty_size, enum btrfs_lock_nesting nest); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid); int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf); void btrfs_extend_item(struct btrfs_path *path, u32 data_size); void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *new_key, unsigned long split_offset); int btrfs_duplicate_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *new_key); int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key); int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow); int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, u64 time_seq); int btrfs_search_slot_for_read(struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int find_higher, int return_any); int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, int start_slot, u64 *last_ret, struct btrfs_key *progress); void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); static inline int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path) { return btrfs_del_items(trans, root, path, path->slots[0], 1); } void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, int nr); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, int nr); static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *key, u32 data_size) { return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); } int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *path); static inline int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *p, u64 time_seq) { ++p->slots[0]; if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) return btrfs_next_old_leaf(root, p, time_seq); return 0; } /* * Search the tree again to find a leaf with greater keys. * * Returns 0 if it found something or 1 if there are no greater leaves. * Returns < 0 on error. */ static inline int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) { return btrfs_next_old_leaf(root, path, 0); } static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { return btrfs_next_old_item(root, p, 0); } int btrfs_leaf_free_space(struct extent_buffer *leaf); int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent); static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) { /* * Do it this way so we only ever do one test_bit in the normal case. */ if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) return 2; return 1; } return 0; } /* * If we remount the fs to be R/O or umount the fs, the cleaner needn't do * anything except sleeping. This function is used to check the status of * the fs. * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount, * since setting and checking for SB_RDONLY in the superblock's flags is not * atomic. */ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) { return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || btrfs_fs_closing(fs_info); } static inline void btrfs_set_sb_rdonly(struct super_block *sb) { sb->s_flags |= SB_RDONLY; set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); } static inline void btrfs_clear_sb_rdonly(struct super_block *sb) { sb->s_flags &= ~SB_RDONLY; clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); } /* root-item.c */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, const char *name, int name_len); int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, const char *name, int name_len); int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key); int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_root_item *item); int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item); int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, struct btrfs_path *path, struct btrfs_root_item *root_item, struct btrfs_key *root_key); int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info); void btrfs_set_root_node(struct btrfs_root_item *item, struct extent_buffer *node); void btrfs_check_and_init_root_item(struct btrfs_root_item *item); void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root); /* uuid-tree.c */ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); /* dir-item.c */ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const char *name, int name_len); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index); struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, const char *name, int name_len, int mod); struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, u64 index, const char *name, int name_len, int mod); struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, const char *name, int name_len); int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_dir_item *di); int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, const char *name, u16 name_len, const void *data, u16 data_len); struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod); struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const char *name, int name_len); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); /* inode-item.c */ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, u64 index); int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, u64 *index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod); struct btrfs_inode_extref * btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, const char *name, int name_len); struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, const char *name, int name_len); /* file-item.c */ struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, u64 disk_offset, u64 disk_num_bytes, u64 num_bytes, u64 offset, u64 ram_bytes, u8 compression, u8 encryption, u16 other_encoding); int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, u64 bytenr, int mod); int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, const bool new_inline, struct extent_map *em); int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size); u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, struct page *page, u64 start, u64 end); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool strict); void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len); int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const char *name, int name_len, int add_backref, u64 index); int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int front); int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, u64 new_size, u32 min_type, u64 *extents_found); int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, struct btrfs_root *parent_root, struct user_namespace *mnt_userns); void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, unsigned *bits); void btrfs_clear_delalloc_extent(struct inode *inode, struct extent_state *state, unsigned *bits); void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, struct extent_state *other); void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, unsigned long bio_flags); void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); void btrfs_free_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); struct inode *btrfs_iget_path(struct super_block *s, u64 ino, struct btrfs_root *root, struct btrfs_path *path); struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 end); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc); int btrfs_writepage_cow_fixup(struct page *page); void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate); extern const struct dentry_operations btrfs_dentry_operations; extern const struct iomap_ops btrfs_dio_iomap_ops; extern const struct iomap_dio_ops btrfs_dio_ops; /* Inode locking type flags, by default the exclusive lock is taken */ #define BTRFS_ILOCK_SHARED (1U << 0) #define BTRFS_ILOCK_TRY (1U << 1) #define BTRFS_ILOCK_MMAP (1U << 2) int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); int btrfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int __pure btrfs_is_empty_uuid(u8 *uuid); int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_pages); void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); /* file.c */ int __init btrfs_auto_defrag_init(void); void __cold btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, int skip_pinned); extern const struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_drop_extents_args *args); int btrfs_replace_file_extents(struct btrfs_inode *inode, struct btrfs_path *path, const u64 start, const u64 end, struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached, bool noreserve); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes); void btrfs_check_nocow_unlock(struct btrfs_inode *inode); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); /* super.c */ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); int btrfs_sync_fs(struct super_block *sb, int wait); char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid); static inline __printf(2, 3) __cold void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { } #ifdef CONFIG_PRINTK __printf(2, 3) __cold void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); #else #define btrfs_printk(fs_info, fmt, args...) \ btrfs_no_printk(fs_info, fmt, ##args) #endif #define btrfs_emerg(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_EMERG fmt, ##args) #define btrfs_alert(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_ALERT fmt, ##args) #define btrfs_crit(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_CRIT fmt, ##args) #define btrfs_err(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_ERR fmt, ##args) #define btrfs_warn(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_WARNING fmt, ##args) #define btrfs_notice(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) #define btrfs_info(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_INFO fmt, ##args) /* * Wrappers that use printk_in_rcu */ #define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) #define btrfs_alert_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) #define btrfs_crit_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) #define btrfs_err_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) #define btrfs_warn_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) #define btrfs_notice_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) #define btrfs_info_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) /* * Wrappers that use a ratelimited printk_in_rcu */ #define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) #define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) #define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) #define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) #define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) #define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) #define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) /* * Wrappers that use a ratelimited printk */ #define btrfs_emerg_rl(fs_info, fmt, args...) \ btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) #define btrfs_alert_rl(fs_info, fmt, args...) \ btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) #define btrfs_crit_rl(fs_info, fmt, args...) \ btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) #define btrfs_err_rl(fs_info, fmt, args...) \ btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) #define btrfs_warn_rl(fs_info, fmt, args...) \ btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) #define btrfs_notice_rl(fs_info, fmt, args...) \ btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) #define btrfs_info_rl(fs_info, fmt, args...) \ btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) #if defined(CONFIG_DYNAMIC_DEBUG) #define btrfs_debug(fs_info, fmt, args...) \ _dynamic_func_call_no_desc(fmt, btrfs_printk, \ fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_in_rcu(fs_info, fmt, args...) \ _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl(fs_info, fmt, args...) \ _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \ fs_info, KERN_DEBUG fmt, ##args) #elif defined(DEBUG) #define btrfs_debug(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl(fs_info, fmt, args...) \ btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) #else #define btrfs_debug(fs_info, fmt, args...) \ btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_in_rcu(fs_info, fmt, args...) \ btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl(fs_info, fmt, args...) \ btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) #endif #define btrfs_printk_in_rcu(fs_info, fmt, args...) \ do { \ rcu_read_lock(); \ btrfs_printk(fs_info, fmt, ##args); \ rcu_read_unlock(); \ } while (0) #define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \ do { \ rcu_read_lock(); \ btrfs_no_printk(fs_info, fmt, ##args); \ rcu_read_unlock(); \ } while (0) #define btrfs_printk_ratelimited(fs_info, fmt, args...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ if (__ratelimit(&_rs)) \ btrfs_printk(fs_info, fmt, ##args); \ } while (0) #define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ do { \ rcu_read_lock(); \ btrfs_printk_ratelimited(fs_info, fmt, ##args); \ rcu_read_unlock(); \ } while (0) #ifdef CONFIG_BTRFS_ASSERT __cold __noreturn static inline void assertfail(const char *expr, const char *file, int line) { pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); BUG(); } #define ASSERT(expr) \ (likely(expr) ? (void)0 : assertfail(#expr, __FILE__, __LINE__)) #else static inline void assertfail(const char *expr, const char* file, int line) { } #define ASSERT(expr) (void)(expr) #endif #if BITS_PER_LONG == 32 #define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) /* * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical * addresses of extents. * * For 4K page size it's about 10T, for 64K it's 160T. */ #define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8) void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info); void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info); #endif /* * Get the correct offset inside the page of extent buffer. * * @eb: target extent buffer * @start: offset inside the extent buffer * * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. */ static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, unsigned long offset) { /* * For sectorsize == PAGE_SIZE case, eb->start will always be aligned * to PAGE_SIZE, thus adding it won't cause any difference. * * For sectorsize < PAGE_SIZE, we must only read the data that belongs * to the eb, thus we have to take the eb->start into consideration. */ return offset_in_page(offset + eb->start); } static inline unsigned long get_eb_page_index(unsigned long offset) { /* * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. * * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, * and have ensured that all tree blocks are contained in one page, * thus we always get index == 0. */ return offset >> PAGE_SHIFT; } /* * Use that for functions that are conditionally exported for sanity tests but * otherwise static */ #ifndef CONFIG_BTRFS_FS_RUN_SANITY_TESTS #define EXPORT_FOR_TESTS static #else #define EXPORT_FOR_TESTS #endif __cold static inline void btrfs_print_v0_err(struct btrfs_fs_info *fs_info) { btrfs_err(fs_info, "Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); } __printf(5, 6) __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); const char * __attribute_const__ btrfs_decode_error(int errno); __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, unsigned int line, int errno); /* * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact line number is reported. */ #define btrfs_abort_transaction(trans, errno) \ do { \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ if ((errno) != -EIO && (errno) != -EROFS) { \ WARN(1, KERN_DEBUG \ "BTRFS: Transaction aborted (error %d)\n", \ (errno)); \ } else { \ btrfs_debug((trans)->fs_info, \ "Transaction aborted (error %d)", \ (errno)); \ } \ } \ __btrfs_abort_transaction((trans), __func__, \ __LINE__, (errno)); \ } while (0) #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ do { \ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args); \ } while (0) #define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) __printf(5, 6) __cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); /* * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic * will panic(). Otherwise we BUG() here. */ #define btrfs_panic(fs_info, errno, fmt, args...) \ do { \ __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ BUG(); \ } while (0) /* compatibility and incompatibility defines */ #define btrfs_set_fs_incompat(__fs_info, opt) \ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ #opt) static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char* name) { struct btrfs_super_block *disk_super; u64 features; disk_super = fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (!(features & flag)) { spin_lock(&fs_info->super_lock); features = btrfs_super_incompat_flags(disk_super); if (!(features & flag)) { features |= flag; btrfs_set_super_incompat_flags(disk_super, features); btrfs_info(fs_info, "setting incompat feature flag for %s (0x%llx)", name, flag); } spin_unlock(&fs_info->super_lock); } } #define btrfs_clear_fs_incompat(__fs_info, opt) \ __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ #opt) static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char* name) { struct btrfs_super_block *disk_super; u64 features; disk_super = fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (features & flag) { spin_lock(&fs_info->super_lock); features = btrfs_super_incompat_flags(disk_super); if (features & flag) { features &= ~flag; btrfs_set_super_incompat_flags(disk_super, features); btrfs_info(fs_info, "clearing incompat feature flag for %s (0x%llx)", name, flag); } spin_unlock(&fs_info->super_lock); } } #define btrfs_fs_incompat(fs_info, opt) \ __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) { struct btrfs_super_block *disk_super; disk_super = fs_info->super_copy; return !!(btrfs_super_incompat_flags(disk_super) & flag); } #define btrfs_set_fs_compat_ro(__fs_info, opt) \ __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ #opt) static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, const char *name) { struct btrfs_super_block *disk_super; u64 features; disk_super = fs_info->super_copy; features = btrfs_super_compat_ro_flags(disk_super); if (!(features & flag)) { spin_lock(&fs_info->super_lock); features = btrfs_super_compat_ro_flags(disk_super); if (!(features & flag)) { features |= flag; btrfs_set_super_compat_ro_flags(disk_super, features); btrfs_info(fs_info, "setting compat-ro feature flag for %s (0x%llx)", name, flag); } spin_unlock(&fs_info->super_lock); } } #define btrfs_clear_fs_compat_ro(__fs_info, opt) \ __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ #opt) static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, const char *name) { struct btrfs_super_block *disk_super; u64 features; disk_super = fs_info->super_copy; features = btrfs_super_compat_ro_flags(disk_super); if (features & flag) { spin_lock(&fs_info->super_lock); features = btrfs_super_compat_ro_flags(disk_super); if (features & flag) { features &= ~flag; btrfs_set_super_compat_ro_flags(disk_super, features); btrfs_info(fs_info, "clearing compat-ro feature flag for %s (0x%llx)", name, flag); } spin_unlock(&fs_info->super_lock); } } #define btrfs_fs_compat_ro(fs_info, opt) \ __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) { struct btrfs_super_block *disk_super; disk_super = fs_info->super_copy; return !!(btrfs_super_compat_ro_flags(disk_super) & flag); } /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir); #else #define btrfs_get_acl NULL #define btrfs_set_acl NULL static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir) { return 0; } #endif /* relocation.c */ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_recover_relocation(struct btrfs_root *root); int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *cow); void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve); int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_should_ignore_reloc_root(struct btrfs_root *root); /* scrub.c */ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly, int is_dev_replace); void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); int btrfs_scrub_cancel(struct btrfs_fs_info *info); int btrfs_scrub_cancel_dev(struct btrfs_device *dev); int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_scrub_progress *progress); static inline void btrfs_init_full_stripe_locks_tree( struct btrfs_full_stripe_locks_tree *locks_root) { locks_root->root = RB_ROOT; mutex_init(&locks_root->lock); } /* dev-replace.c */ void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info); void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) { btrfs_bio_counter_sub(fs_info, 1); } /* reada.c */ struct reada_control { struct btrfs_fs_info *fs_info; /* tree to prefetch */ struct btrfs_key key_start; struct btrfs_key key_end; /* exclusive */ atomic_t elems; struct kref refcnt; wait_queue_head_t wait; }; struct reada_control *btrfs_reada_add(struct btrfs_root *root, struct btrfs_key *start, struct btrfs_key *end); int btrfs_reada_wait(void *handle); void btrfs_reada_detach(void *handle); int btree_readahead_hook(struct extent_buffer *eb, int err); void btrfs_reada_remove_dev(struct btrfs_device *dev); void btrfs_reada_undo_remove_dev(struct btrfs_device *dev); static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID && !btrfs_qgroup_level(rootid))) return 1; return 0; } static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) { return signal_pending(current); } /* verity.c */ #ifdef CONFIG_FS_VERITY extern const struct fsverity_operations btrfs_verityops; int btrfs_drop_verity_items(struct btrfs_inode *inode); BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, encryption, 8); BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, size, 64); BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, struct btrfs_verity_descriptor_item, encryption, 8); BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, struct btrfs_verity_descriptor_item, size, 64); #else static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) { return 0; } #endif /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); } #else static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { return 0; } #endif static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) { return fs_info->zoned != 0; } /* * Count how many fs_info->max_extent_size cover the @size */ static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) { #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (!fs_info) return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); #endif return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); } static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) { return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; } /* * We use page status Private2 to indicate there is an ordered extent with * unfinished IO. * * Rename the Private2 accessors to Ordered, to improve readability. */ #define PageOrdered(page) PagePrivate2(page) #define SetPageOrdered(page) SetPagePrivate2(page) #define ClearPageOrdered(page) ClearPagePrivate2(page) #endif |
39 8 71 21 34 43 9 10 3 16 4 1 23 19 4 1 3 2 1 1 1 3 13 3 1 2 2 3 2 8 2 2 26 2 257 29 4 246 255 258 258 1093 5 1760 14 3 7736 12 550 547 104 440 5 5 455 6 5 445 445 133 365 365 170 170 170 1165 3 2 1156 1157 611 550 423 771 4549 4535 1341 3307 23 23 23 2 7 13 21 16 16 16 5442 2 1 5438 5434 923 4549 1343 4268 963 146 145 814 288 675 946 962 5291 392 1046 4245 1927 3461 5102 5295 1 27 2 217 218 226 226 1 49 2 187 185 196 196 1346 1355 1351 1344 439 884 5 362 260 826 1 2 24 811 297 507 671 631 3 3 3 3 24 24 1388 2 6 1382 1058 321 345 963 307 305 2 474 2 2 318 2 340 26 130 209 2 191 145 325 162 175 785 72 471 313 2 620 180 775 283 546 487 1 66 2 476 475 208 278 328 1 78 2 320 316 166 178 341 789 416 69 69 79 251 251 471 76 2 383 86 468 19 2 457 446 1 8 2 426 28 160 263 264 39 226 421 430 474 1 84 386 7 3 63 2 60 29 28 68 68 55 52 5 1 19 32 28 43 43 84 19 7 11 63 30 41 2 27 4 39 20 23 73 77 42 19 10 1589 3 42 1607 30 1393 212 1 1578 1572 37 4 4 2 4 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/read_write.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/slab.h> #include <linux/stat.h> #include <linux/sched/xacct.h> #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/fs.h> #include "internal.h" #include <linux/uaccess.h> #include <asm/unistd.h> const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .mmap = generic_file_readonly_mmap, .splice_read = generic_file_splice_read, }; EXPORT_SYMBOL(generic_ro_fops); static inline bool unsigned_offsets(struct file *file) { return file->f_mode & FMODE_UNSIGNED_OFFSET; } /** * vfs_setpos - update the file offset for lseek * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size * * This is a low-level filesystem helper for updating the file offset to * the value specified by @offset if the given offset is valid and it is * not equal to the current file offset. * * Return the specified offset on success and -EINVAL on invalid offset. */ loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; if (offset > maxsize) return -EINVAL; if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } return offset; } EXPORT_SYMBOL(vfs_setpos); /** * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: max size of this file in file system * @eof: offset used for SEEK_END position * * This is a variant of generic_file_llseek that allows passing in a custom * maximum file size and a custom EOF position, for e.g. hashed directories * * Synchronization: * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. * read/writes behave like SEEK_SET against seeks. */ loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { switch (whence) { case SEEK_END: offset += eof; break; case SEEK_CUR: /* * Here we special-case the lseek(fd, 0, SEEK_CUR) * position-querying operation. Avoid rewriting the "same" * f_pos value back to the file because a concurrent read(), * write() or lseek() might have altered it */ if (offset == 0) return file->f_pos; /* * f_lock protects against read/modify/write race with other * SEEK_CURs. Note that parallel writes and reads behave * like SEEK_SET. */ spin_lock(&file->f_lock); offset = vfs_setpos(file, file->f_pos + offset, maxsize); spin_unlock(&file->f_lock); return offset; case SEEK_DATA: /* * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ if ((unsigned long long)offset >= eof) return -ENXIO; break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ if ((unsigned long long)offset >= eof) return -ENXIO; offset = eof; break; } return vfs_setpos(file, offset, maxsize); } EXPORT_SYMBOL(generic_file_llseek_size); /** * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is a generic implemenation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by * @offset and @whence. */ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; return generic_file_llseek_size(file, offset, whence, inode->i_sb->s_maxbytes, i_size_read(inode)); } EXPORT_SYMBOL(generic_file_llseek); /** * fixed_size_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: size of the file * */ loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: return generic_file_llseek_size(file, offset, whence, size, size); default: return -EINVAL; } } EXPORT_SYMBOL(fixed_size_llseek); /** * no_seek_end_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * */ loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, OFFSET_MAX, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek); /** * no_seek_end_llseek_size - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: maximal offset allowed * */ loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, size, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek_size); /** * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is an implementation of ->llseek useable for the rare special case when * userspace expects the seek to succeed but the (device) file is actually not * able to perform the seek. In this case you use noop_llseek() instead of * falling back to the default implementation of ->llseek. */ loff_t noop_llseek(struct file *file, loff_t offset, int whence) { return file->f_pos; } EXPORT_SYMBOL(noop_llseek); loff_t no_llseek(struct file *file, loff_t offset, int whence) { return -ESPIPE; } EXPORT_SYMBOL(no_llseek); loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); loff_t retval; inode_lock(inode); switch (whence) { case SEEK_END: offset += i_size_read(inode); break; case SEEK_CUR: if (offset == 0) { retval = file->f_pos; goto out; } offset += file->f_pos; break; case SEEK_DATA: /* * In the generic case the entire file is data, so as * long as offset isn't at the end of the file then the * offset is data. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so * as long as offset isn't i_size or larger, return * i_size. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } offset = inode->i_size; break; } retval = -EINVAL; if (offset >= 0 || unsigned_offsets(file)) { if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } retval = offset; } out: inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { loff_t (*fn)(struct file *, loff_t, int); fn = no_llseek; if (file->f_mode & FMODE_LSEEK) { if (file->f_op->llseek) fn = file->f_op->llseek; } return fn(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) { off_t retval; struct fd f = fdget_pos(fd); if (!f.file) return -EBADF; retval = -EINVAL; if (whence <= SEEK_MAX) { loff_t res = vfs_llseek(f.file, offset, whence); retval = res; if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } fdput_pos(f); return retval; } SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #endif #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \ defined(__ARCH_WANT_SYS_LLSEEK) SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, unsigned int, whence) { int retval; struct fd f = fdget_pos(fd); loff_t offset; if (!f.file) return -EBADF; retval = -EINVAL; if (whence > SEEK_MAX) goto out_putf; offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low, whence); retval = (int)offset; if (offset >= 0) { retval = -EFAULT; if (!copy_to_user(result, &offset, sizeof(offset))) retval = 0; } out_putf: fdput_pos(f); return retval; } #endif int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { if (unlikely((ssize_t) count < 0)) return -EINVAL; /* * ranged mandatory locking does not apply to streams - it makes sense * only for files where position has a meaning. */ if (ppos) { loff_t pos = *ppos; if (unlikely(pos < 0)) { if (!unsigned_offsets(file)) return -EINVAL; if (count >= -pos) /* both values are in 0..LLONG_MAX */ return -EOVERFLOW; } else if (unlikely((loff_t) (pos + count) < 0)) { if (!unsigned_offsets(file)) return -EINVAL; } } return security_file_permission(file, read_write == READ ? MAY_READ : MAY_WRITE); } static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = len }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_init(&iter, READ, &iov, 1, len); ret = call_read_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } static int warn_unsupported(struct file *file, const char *op) { pr_warn_ratelimited( "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) return -EINVAL; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; /* * Also fail if ->read_iter and ->read are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->read_iter || file->f_op->read)) return warn_unsupported(file, "read"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len); ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; return __kernel_read(file, buf, count, pos); } EXPORT_SYMBOL(kernel_read); ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos); else if (file->f_op->read_iter) ret = new_sync_read(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_init(&iter, WRITE, &iov, 1, len); ret = call_write_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0 && ppos) *ppos = kiocb.ki_pos; return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = (void *)buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; /* * Also fail if ->write_iter and ->write are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->write_iter || file->f_op->write)) return warn_unsupported(file, "write"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len); ret = file->f_op->write_iter(&kiocb, &iter); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); return ret; } /* * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", * but autofs is one of the few internal kernel users that actually * wants this _and_ can be built as a module. So we need to export * this symbol for autofs, even though it really isn't appropriate * for any other kernel modules. */ EXPORT_SYMBOL_GPL(__kernel_write); ssize_t kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; file_start_write(file); ret = __kernel_write(file, buf, count, pos); file_end_write(file); return ret; } EXPORT_SYMBOL(kernel_write); ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else if (file->f_op->write_iter) ret = new_sync_write(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); file_end_write(file); return ret; } /* file_ppos returns &file->f_pos or NULL if file is stream */ static inline loff_t *file_ppos(struct file *file) { return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_read(f.file, buf, count, ppos); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } return ret; } SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { return ksys_read(fd, buf, count); } ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_write(f.file, buf, count, ppos); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } return ret; } SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { return ksys_write(fd, buf, count); } ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PREAD) ret = vfs_read(f.file, buf, count, &pos); fdput(f); } return ret; } SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, size_t, count, loff_t, pos) { return ksys_pread64(fd, buf, count, pos); } ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PWRITE) ret = vfs_write(f.file, buf, count, &pos); fdput(f); } return ret; } SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, loff_t, pos) { return ksys_pwrite64(fd, buf, count, pos); } static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { struct kiocb kiocb; ssize_t ret; init_sync_kiocb(&kiocb, filp); ret = kiocb_set_rw_flags(&kiocb, flags); if (ret) return ret; kiocb.ki_pos = (ppos ? *ppos : 0); if (type == READ) ret = call_read_iter(filp, &kiocb, iter); else ret = call_write_iter(filp, &kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { ssize_t ret = 0; if (flags & ~RWF_HIPRI) return -EOPNOTSUPP; while (iov_iter_count(iter)) { struct iovec iovec = iov_iter_iovec(iter); ssize_t nr; if (type == READ) { nr = filp->f_op->read(filp, iovec.iov_base, iovec.iov_len, ppos); } else { nr = filp->f_op->write(filp, iovec.iov_base, iovec.iov_len, ppos); } if (nr < 0) { if (!ret) ret = nr; break; } ret += nr; if (nr != iovec.iov_len) break; iov_iter_advance(iter, nr); } return ret; } static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, loff_t *pos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, pos, tot_len); if (ret < 0) return ret; if (file->f_op->read_iter) ret = do_iter_readv_writev(file, iter, pos, READ, flags); else ret = do_loop_readv_writev(file, iter, pos, READ, flags); out: if (ret >= 0) fsnotify_access(file); return ret; } ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; ret = call_read_iter(file, iocb, iter); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_read); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { if (!file->f_op->read_iter) return -EINVAL; return do_iter_read(file, iter, ppos, flags); } EXPORT_SYMBOL(vfs_iter_read); static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, loff_t *pos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, pos, tot_len); if (ret < 0) return ret; if (file->f_op->write_iter) ret = do_iter_readv_writev(file, iter, pos, WRITE, flags); else ret = do_loop_readv_writev(file, iter, pos, WRITE, flags); if (ret > 0) fsnotify_modify(file); return ret; } ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->write_iter) return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; ret = call_write_iter(file, iocb, iter); if (ret > 0) fsnotify_modify(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_write); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { if (!file->f_op->write_iter) return -EINVAL; return do_iter_write(file, iter, ppos, flags); } EXPORT_SYMBOL(vfs_iter_write); static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; ssize_t ret; ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret >= 0) { ret = do_iter_read(file, &iter, pos, flags); kfree(iov); } return ret; } static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; ssize_t ret; ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret >= 0) { file_start_write(file); ret = do_iter_write(file, &iter, pos, flags); file_end_write(file); kfree(iov); } return ret; } static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_readv(f.file, vec, vlen, ppos, flags); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_writev(f.file, vec, vlen, ppos, flags); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) { #define HALF_LONG_BITS (BITS_PER_LONG / 2) return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; } static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PREAD) ret = vfs_readv(f.file, vec, vlen, &pos, flags); fdput(f); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PWRITE) ret = vfs_writev(f.file, vec, vlen, &pos, flags); fdput(f); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_readv(fd, vec, vlen, 0); } SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_writev(fd, vec, vlen, 0); } SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_preadv(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_pwritev(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } /* * Various compat syscalls. Note that they all pretend to take a native * iovec - import_iovec will properly treat those as compat_iovecs based on * in_compat_syscall(). */ #ifdef CONFIG_COMPAT #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_preadv(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_preadv(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_pwritev(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_pwritev(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { struct fd in, out; struct inode *in_inode, *out_inode; struct pipe_inode_info *opipe; loff_t pos; loff_t out_pos; ssize_t retval; int fl; /* * Get input file, and verify that it is ok.. */ retval = -EBADF; in = fdget(in_fd); if (!in.file) goto out; if (!(in.file->f_mode & FMODE_READ)) goto fput_in; retval = -ESPIPE; if (!ppos) { pos = in.file->f_pos; } else { pos = *ppos; if (!(in.file->f_mode & FMODE_PREAD)) goto fput_in; } retval = rw_verify_area(READ, in.file, &pos, count); if (retval < 0) goto fput_in; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; /* * Get output file, and verify that it is ok.. */ retval = -EBADF; out = fdget(out_fd); if (!out.file) goto fput_in; if (!(out.file->f_mode & FMODE_WRITE)) goto fput_out; in_inode = file_inode(in.file); out_inode = file_inode(out.file); out_pos = out.file->f_pos; if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); if (unlikely(pos + count > max)) { retval = -EOVERFLOW; if (pos >= max) goto fput_out; count = max - pos; } fl = 0; #if 0 /* * We need to debate whether we can enable this or not. The * man page documents EAGAIN return for the output at least, * and the application is arguably buggy if it doesn't expect * EAGAIN on a non-blocking file descriptor. */ if (in.file->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif opipe = get_pipe_info(out.file, true); if (!opipe) { retval = rw_verify_area(WRITE, out.file, &out_pos, count); if (retval < 0) goto fput_out; file_start_write(out.file); retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); file_end_write(out.file); } else { if (out.file->f_flags & O_NONBLOCK) fl |= SPLICE_F_NONBLOCK; retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl); } if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); fsnotify_access(in.file); fsnotify_modify(out.file); out.file->f_pos = out_pos; if (ppos) *ppos = pos; else in.file->f_pos = pos; } inc_syscr(current); inc_syscw(current); if (pos > max) retval = -EOVERFLOW; fput_out: fdput(out); fput_in: fdput(in); out: return retval; } SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, compat_off_t __user *, offset, compat_size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, compat_loff_t __user *, offset, compat_size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #endif /** * generic_copy_file_range - copy data between two files * @file_in: file structure to read from * @pos_in: file offset to read from * @file_out: file structure to write data to * @pos_out: file offset to write data to * @len: amount of data to copy * @flags: copy flags * * This is a generic filesystem helper to copy data from one file to another. * It has no constraints on the source or destination file owners - the files * can belong to different superblocks and different filesystem types. Short * copies are allowed. * * This should be called from the @file_out filesystem, as per the * ->copy_file_range() method. * * Returns the number of bytes copied or a negative error indicating the * failure. */ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { return do_splice_direct(file_in, &pos_in, file_out, &pos_out, len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); } EXPORT_SYMBOL(generic_copy_file_range); /* * Performs necessary checks before doing a file copy * * Can adjust amount of bytes to copy via @req_count argument. * Returns appropriate error code that caller should return or * zero in case the copy should be allowed. */ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t *req_count, unsigned int flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); uint64_t count = *req_count; loff_t size_in; int ret; ret = generic_file_rw_checks(file_in, file_out); if (ret) return ret; /* * We allow some filesystems to handle cross sb copy, but passing * a file of the wrong filesystem type to filesystem driver can result * in an attempt to dereference the wrong type of ->private_data, so * avoid doing that until we really have a good reason. * * nfs and cifs define several different file_system_type structures * and several different sets of file_operations, but they all end up * using the same ->copy_file_range() function pointer. */ if (flags & COPY_FILE_SPLICE) { /* cross sb splice is allowed */ } else if (file_out->f_op->copy_file_range) { if (file_in->f_op->copy_file_range != file_out->f_op->copy_file_range) return -EXDEV; } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) { return -EXDEV; } /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode_out)) return -EPERM; if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) return -ETXTBSY; /* Ensure offsets don't wrap. */ if (pos_in + count < pos_in || pos_out + count < pos_out) return -EOVERFLOW; /* Shorten the copy to EOF */ size_in = i_size_read(inode_in); if (pos_in >= size_in) count = 0; else count = min(count, size_in - (uint64_t)pos_in); ret = generic_write_check_limits(file_out, pos_out, &count); if (ret) return ret; /* Don't allow overlapped copying within the same file. */ if (inode_in == inode_out && pos_out + count > pos_in && pos_out < pos_in + count) return -EINVAL; *req_count = count; return 0; } /* * copy_file_range() differs from regular file read and write in that it * specifically allows return partial success. When it does so is up to * the copy_file_range method. */ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { ssize_t ret; bool splice = flags & COPY_FILE_SPLICE; if (flags & ~COPY_FILE_SPLICE) return -EINVAL; ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len, flags); if (unlikely(ret)) return ret; ret = rw_verify_area(READ, file_in, &pos_in, len); if (unlikely(ret)) return ret; ret = rw_verify_area(WRITE, file_out, &pos_out, len); if (unlikely(ret)) return ret; if (len == 0) return 0; file_start_write(file_out); /* * Cloning is supported by more file systems, so we implement copy on * same sb using clone, but for filesystems where both clone and copy * are supported (e.g. nfs,cifs), we only call the copy method. */ if (!splice && file_out->f_op->copy_file_range) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); goto done; } if (!splice && file_in->f_op->remap_file_range && file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, min_t(loff_t, MAX_RW_COUNT, len), REMAP_FILE_CAN_SHORTEN); if (ret > 0) goto done; } /* * We can get here for same sb copy of filesystems that do not implement * ->copy_file_range() in case filesystem does not support clone or in * case filesystem supports clone but rejected the clone request (e.g. * because it was not block aligned). * * In both cases, fall back to kernel copy so we are able to maintain a * consistent story about which filesystems support copy_file_range() * and which filesystems do not, that will allow userspace tools to * make consistent desicions w.r.t using copy_file_range(). * * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE. */ ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); done: if (ret > 0) { fsnotify_access(file_in); add_rchar(current, ret); fsnotify_modify(file_out); add_wchar(current, ret); } inc_syscr(current); inc_syscw(current); file_end_write(file_out); return ret; } EXPORT_SYMBOL(vfs_copy_file_range); SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { loff_t pos_in; loff_t pos_out; struct fd f_in; struct fd f_out; ssize_t ret = -EBADF; f_in = fdget(fd_in); if (!f_in.file) goto out2; f_out = fdget(fd_out); if (!f_out.file) goto out1; ret = -EFAULT; if (off_in) { if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) goto out; } else { pos_in = f_in.file->f_pos; } if (off_out) { if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) goto out; } else { pos_out = f_out.file->f_pos; } ret = -EINVAL; if (flags != 0) goto out; ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len, flags); if (ret > 0) { pos_in += ret; pos_out += ret; if (off_in) { if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) ret = -EFAULT; } else { f_in.file->f_pos = pos_in; } if (off_out) { if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) ret = -EFAULT; } else { f_out.file->f_pos = pos_out; } } out: fdput(f_out); out1: fdput(f_in); out2: return ret; } /* * Don't operate on ranges the page cache doesn't support, and don't exceed the * LFS limits. If pos is under the limit it becomes a short access. If it * exceeds the limit we return -EFBIG. */ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) { struct inode *inode = file->f_mapping->host; loff_t max_size = inode->i_sb->s_maxbytes; loff_t limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY) { if (pos >= limit) { send_sig(SIGXFSZ, current, 0); return -EFBIG; } *count = min(*count, limit - pos); } if (!(file->f_flags & O_LARGEFILE)) max_size = MAX_NON_LFS; if (unlikely(pos >= max_size)) return -EFBIG; *count = min(*count, max_size - pos); return 0; } /* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. * Returns appropriate error code that caller should return or * zero in case that write should be allowed. */ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; loff_t count; int ret; if (IS_SWAPFILE(inode)) return -ETXTBSY; if (!iov_iter_count(from)) return 0; /* FIXME: this is for backwards compatibility with 2.4 */ if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) return -EINVAL; count = iov_iter_count(from); ret = generic_write_check_limits(file, iocb->ki_pos, &count); if (ret) return ret; iov_iter_truncate(from, count); return iov_iter_count(from); } EXPORT_SYMBOL(generic_write_checks); /* * Performs common checks before doing a file copy/clone * from @file_in to @file_out. */ int generic_file_rw_checks(struct file *file_in, struct file *file_out) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); /* Don't copy dirs, pipes, sockets... */ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; if (!(file_in->f_mode & FMODE_READ) || !(file_out->f_mode & FMODE_WRITE) || (file_out->f_flags & O_APPEND)) return -EBADF; return 0; } |
85 86 86 4 4 4 1 2 1 4 14 14 10 4 14 13 14 13 90 90 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2008-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH */ #include <linux/export.h> #include <linux/etherdevice.h> #include <net/mac80211.h> #include <asm/unaligned.h> #include "ieee80211_i.h" #include "rate.h" #include "mesh.h" #include "led.h" #include "wme.h" void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int tmp; skb->pkt_type = IEEE80211_TX_STATUS_MSG; skb_queue_tail(info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS ? &local->skb_queue : &local->skb_queue_unreliable, skb); tmp = skb_queue_len(&local->skb_queue) + skb_queue_len(&local->skb_queue_unreliable); while (tmp > IEEE80211_IRQSAFE_QUEUE_LIMIT && (skb = skb_dequeue(&local->skb_queue_unreliable))) { ieee80211_free_txskb(hw, skb); tmp--; I802_DEBUG_INC(local->tx_status_drop); } tasklet_schedule(&local->tasklet); } EXPORT_SYMBOL(ieee80211_tx_status_irqsafe); static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; int ac; if (info->flags & (IEEE80211_TX_CTL_NO_PS_BUFFER | IEEE80211_TX_CTL_AMPDU | IEEE80211_TX_CTL_HW_80211_ENCAP)) { ieee80211_free_txskb(&local->hw, skb); return; } /* * This skb 'survived' a round-trip through the driver, and * hopefully the driver didn't mangle it too badly. However, * we can definitely not rely on the control information * being correct. Clear it so we don't get junk there, and * indicate that it needs new processing, but must not be * modified/encrypted again. */ memset(&info->control, 0, sizeof(info->control)); info->control.jiffies = jiffies; info->control.vif = &sta->sdata->vif; info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; info->flags |= IEEE80211_TX_INTFL_RETRANSMISSION; info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; sta->status_stats.filtered++; /* * Clear more-data bit on filtered frames, it might be set * but later frames might time out so it might have to be * clear again ... It's all rather unlikely (this frame * should time out first, right?) but let's not confuse * peers unnecessarily. */ if (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_MOREDATA)) hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_MOREDATA); if (ieee80211_is_data_qos(hdr->frame_control)) { u8 *p = ieee80211_get_qos_ctl(hdr); int tid = *p & IEEE80211_QOS_CTL_TID_MASK; /* * Clear EOSP if set, this could happen e.g. * if an absence period (us being a P2P GO) * shortens the SP. */ if (*p & IEEE80211_QOS_CTL_EOSP) *p &= ~IEEE80211_QOS_CTL_EOSP; ac = ieee80211_ac_from_tid(tid); } else { ac = IEEE80211_AC_BE; } /* * Clear the TX filter mask for this STA when sending the next * packet. If the STA went to power save mode, this will happen * when it wakes up for the next time. */ set_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT); ieee80211_clear_fast_xmit(sta); /* * This code races in the following way: * * (1) STA sends frame indicating it will go to sleep and does so * (2) hardware/firmware adds STA to filter list, passes frame up * (3) hardware/firmware processes TX fifo and suppresses a frame * (4) we get TX status before having processed the frame and * knowing that the STA has gone to sleep. * * This is actually quite unlikely even when both those events are * processed from interrupts coming in quickly after one another or * even at the same time because we queue both TX status events and * RX frames to be processed by a tasklet and process them in the * same order that they were received or TX status last. Hence, there * is no race as long as the frame RX is processed before the next TX * status, which drivers can ensure, see below. * * Note that this can only happen if the hardware or firmware can * actually add STAs to the filter list, if this is done by the * driver in response to set_tim() (which will only reduce the race * this whole filtering tries to solve, not completely solve it) * this situation cannot happen. * * To completely solve this race drivers need to make sure that they * (a) don't mix the irq-safe/not irq-safe TX status/RX processing * functions and * (b) always process RX events before TX status events if ordering * can be unknown, for example with different interrupt status * bits. * (c) if PS mode transitions are manual (i.e. the flag * %IEEE80211_HW_AP_LINK_PS is set), always process PS state * changes before calling TX status events if ordering can be * unknown. */ if (test_sta_flag(sta, WLAN_STA_PS_STA) && skb_queue_len(&sta->tx_filtered[ac]) < STA_MAX_TX_BUFFER) { skb_queue_tail(&sta->tx_filtered[ac], skb); sta_info_recalc_tim(sta); if (!timer_pending(&local->sta_cleanup)) mod_timer(&local->sta_cleanup, round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); return; } if (!test_sta_flag(sta, WLAN_STA_PS_STA) && !(info->flags & IEEE80211_TX_INTFL_RETRIED)) { /* Software retry the packet once */ info->flags |= IEEE80211_TX_INTFL_RETRIED; ieee80211_add_pending_skb(local, skb); return; } ps_dbg_ratelimited(sta->sdata, "dropped TX filtered frame, queue_len=%d PS=%d @%lu\n", skb_queue_len(&sta->tx_filtered[ac]), !!test_sta_flag(sta, WLAN_STA_PS_STA), jiffies); ieee80211_free_txskb(&local->hw, skb); } static void ieee80211_check_pending_bar(struct sta_info *sta, u8 *addr, u8 tid) { struct tid_ampdu_tx *tid_tx; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (!tid_tx || !tid_tx->bar_pending) return; tid_tx->bar_pending = false; ieee80211_send_bar(&sta->sdata->vif, addr, tid, tid_tx->failed_bar_ssn); } static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *) skb->data; struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; if (ieee80211_is_data_qos(mgmt->frame_control)) { struct ieee80211_hdr *hdr = (void *) skb->data; u8 *qc = ieee80211_get_qos_ctl(hdr); u16 tid = qc[0] & 0xf; ieee80211_check_pending_bar(sta, hdr->addr1, tid); } if (ieee80211_is_action(mgmt->frame_control) && !ieee80211_has_protected(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_HT && mgmt->u.action.u.ht_smps.action == WLAN_HT_ACTION_SMPS && ieee80211_sdata_running(sdata)) { enum ieee80211_smps_mode smps_mode; switch (mgmt->u.action.u.ht_smps.smps_control) { case WLAN_HT_SMPS_CONTROL_DYNAMIC: smps_mode = IEEE80211_SMPS_DYNAMIC; break; case WLAN_HT_SMPS_CONTROL_STATIC: smps_mode = IEEE80211_SMPS_STATIC; break; case WLAN_HT_SMPS_CONTROL_DISABLED: default: /* shouldn't happen since we don't send that */ smps_mode = IEEE80211_SMPS_OFF; break; } if (sdata->vif.type == NL80211_IFTYPE_STATION) { /* * This update looks racy, but isn't -- if we come * here we've definitely got a station that we're * talking to, and on a managed interface that can * only be the AP. And the only other place updating * this variable in managed mode is before association. */ sdata->smps_mode = smps_mode; ieee80211_queue_work(&local->hw, &sdata->recalc_smps); } else if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { sta->known_smps_mode = smps_mode; } } } static void ieee80211_set_bar_pending(struct sta_info *sta, u8 tid, u16 ssn) { struct tid_ampdu_tx *tid_tx; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (!tid_tx) return; tid_tx->failed_bar_ssn = ssn; tid_tx->bar_pending = true; } static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info, struct ieee80211_tx_status *status) { int len = sizeof(struct ieee80211_radiotap_header); /* IEEE80211_RADIOTAP_RATE rate */ if (status && status->rate && !(status->rate->flags & (RATE_INFO_FLAGS_MCS | RATE_INFO_FLAGS_DMG | RATE_INFO_FLAGS_EDMG | RATE_INFO_FLAGS_VHT_MCS | RATE_INFO_FLAGS_HE_MCS))) len += 2; else if (info->status.rates[0].idx >= 0 && !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))) len += 2; /* IEEE80211_RADIOTAP_TX_FLAGS */ len += 2; /* IEEE80211_RADIOTAP_DATA_RETRIES */ len += 1; /* IEEE80211_RADIOTAP_MCS * IEEE80211_RADIOTAP_VHT */ if (status && status->rate) { if (status->rate->flags & RATE_INFO_FLAGS_MCS) len += 3; else if (status->rate->flags & RATE_INFO_FLAGS_VHT_MCS) len = ALIGN(len, 2) + 12; else if (status->rate->flags & RATE_INFO_FLAGS_HE_MCS) len = ALIGN(len, 2) + 12; } else if (info->status.rates[0].idx >= 0) { if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS) len += 3; else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) len = ALIGN(len, 2) + 12; } return len; } static void ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, struct ieee80211_supported_band *sband, struct sk_buff *skb, int retry_count, int rtap_len, int shift, struct ieee80211_tx_status *status) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_radiotap_header *rthdr; unsigned char *pos; u16 legacy_rate = 0; u16 txflags; rthdr = skb_push(skb, rtap_len); memset(rthdr, 0, rtap_len); rthdr->it_len = cpu_to_le16(rtap_len); rthdr->it_present = cpu_to_le32(BIT(IEEE80211_RADIOTAP_TX_FLAGS) | BIT(IEEE80211_RADIOTAP_DATA_RETRIES)); pos = (unsigned char *)(rthdr + 1); /* * XXX: Once radiotap gets the bitmap reset thing the vendor * extensions proposal contains, we can actually report * the whole set of tries we did. */ /* IEEE80211_RADIOTAP_RATE */ if (status && status->rate) { if (!(status->rate->flags & (RATE_INFO_FLAGS_MCS | RATE_INFO_FLAGS_DMG | RATE_INFO_FLAGS_EDMG | RATE_INFO_FLAGS_VHT_MCS | RATE_INFO_FLAGS_HE_MCS))) legacy_rate = status->rate->legacy; } else if (info->status.rates[0].idx >= 0 && !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))) legacy_rate = sband->bitrates[info->status.rates[0].idx].bitrate; if (legacy_rate) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_RATE)); *pos = DIV_ROUND_UP(legacy_rate, 5 * (1 << shift)); /* padding for tx flags */ pos += 2; } /* IEEE80211_RADIOTAP_TX_FLAGS */ txflags = 0; if (!(info->flags & IEEE80211_TX_STAT_ACK) && !is_multicast_ether_addr(hdr->addr1)) txflags |= IEEE80211_RADIOTAP_F_TX_FAIL; if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT) txflags |= IEEE80211_RADIOTAP_F_TX_CTS; if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) txflags |= IEEE80211_RADIOTAP_F_TX_RTS; put_unaligned_le16(txflags, pos); pos += 2; /* IEEE80211_RADIOTAP_DATA_RETRIES */ /* for now report the total retry_count */ *pos = retry_count; pos++; if (status && status->rate && (status->rate->flags & RATE_INFO_FLAGS_MCS)) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS | IEEE80211_RADIOTAP_MCS_HAVE_GI | IEEE80211_RADIOTAP_MCS_HAVE_BW; if (status->rate->flags & RATE_INFO_FLAGS_SHORT_GI) pos[1] |= IEEE80211_RADIOTAP_MCS_SGI; if (status->rate->bw == RATE_INFO_BW_40) pos[1] |= IEEE80211_RADIOTAP_MCS_BW_40; pos[2] = status->rate->mcs; pos += 3; } else if (status && status->rate && (status->rate->flags & RATE_INFO_FLAGS_VHT_MCS)) { u16 known = local->hw.radiotap_vht_details & (IEEE80211_RADIOTAP_VHT_KNOWN_GI | IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH); rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); /* u16 known - IEEE80211_RADIOTAP_VHT_KNOWN_* */ put_unaligned_le16(known, pos); pos += 2; /* u8 flags - IEEE80211_RADIOTAP_VHT_FLAG_* */ if (status->rate->flags & RATE_INFO_FLAGS_SHORT_GI) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; pos++; /* u8 bandwidth */ switch (status->rate->bw) { case RATE_INFO_BW_160: *pos = 11; break; case RATE_INFO_BW_80: *pos = 4; break; case RATE_INFO_BW_40: *pos = 1; break; default: *pos = 0; break; } pos++; /* u8 mcs_nss[4] */ *pos = (status->rate->mcs << 4) | status->rate->nss; pos += 4; /* u8 coding */ pos++; /* u8 group_id */ pos++; /* u16 partial_aid */ pos += 2; } else if (status && status->rate && (status->rate->flags & RATE_INFO_FLAGS_HE_MCS)) { struct ieee80211_radiotap_he *he; rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_HE)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); he = (struct ieee80211_radiotap_he *)pos; he->data1 = cpu_to_le16(IEEE80211_RADIOTAP_HE_DATA1_FORMAT_SU | IEEE80211_RADIOTAP_HE_DATA1_DATA_MCS_KNOWN | IEEE80211_RADIOTAP_HE_DATA1_DATA_DCM_KNOWN | IEEE80211_RADIOTAP_HE_DATA1_BW_RU_ALLOC_KNOWN); he->data2 = cpu_to_le16(IEEE80211_RADIOTAP_HE_DATA2_GI_KNOWN); #define HE_PREP(f, val) le16_encode_bits(val, IEEE80211_RADIOTAP_HE_##f) he->data6 |= HE_PREP(DATA6_NSTS, status->rate->nss); #define CHECK_GI(s) \ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_GI_##s != \ (int)NL80211_RATE_INFO_HE_GI_##s) CHECK_GI(0_8); CHECK_GI(1_6); CHECK_GI(3_2); he->data3 |= HE_PREP(DATA3_DATA_MCS, status->rate->mcs); he->data3 |= HE_PREP(DATA3_DATA_DCM, status->rate->he_dcm); he->data5 |= HE_PREP(DATA5_GI, status->rate->he_gi); switch (status->rate->bw) { case RATE_INFO_BW_20: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_20MHZ); break; case RATE_INFO_BW_40: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_40MHZ); break; case RATE_INFO_BW_80: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_80MHZ); break; case RATE_INFO_BW_160: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_160MHZ); break; case RATE_INFO_BW_HE_RU: #define CHECK_RU_ALLOC(s) \ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_##s##T != \ NL80211_RATE_INFO_HE_RU_ALLOC_##s + 4) CHECK_RU_ALLOC(26); CHECK_RU_ALLOC(52); CHECK_RU_ALLOC(106); CHECK_RU_ALLOC(242); CHECK_RU_ALLOC(484); CHECK_RU_ALLOC(996); CHECK_RU_ALLOC(2x996); he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, status->rate->he_ru_alloc + 4); break; default: WARN_ONCE(1, "Invalid SU BW %d\n", status->rate->bw); } pos += sizeof(struct ieee80211_radiotap_he); } if ((status && status->rate) || info->status.rates[0].idx < 0) return; /* IEEE80211_RADIOTAP_MCS * IEEE80211_RADIOTAP_VHT */ if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS | IEEE80211_RADIOTAP_MCS_HAVE_GI | IEEE80211_RADIOTAP_MCS_HAVE_BW; if (info->status.rates[0].flags & IEEE80211_TX_RC_SHORT_GI) pos[1] |= IEEE80211_RADIOTAP_MCS_SGI; if (info->status.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) pos[1] |= IEEE80211_RADIOTAP_MCS_BW_40; if (info->status.rates[0].flags & IEEE80211_TX_RC_GREEN_FIELD) pos[1] |= IEEE80211_RADIOTAP_MCS_FMT_GF; pos[2] = info->status.rates[0].idx; pos += 3; } else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) { u16 known = local->hw.radiotap_vht_details & (IEEE80211_RADIOTAP_VHT_KNOWN_GI | IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH); rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); /* u16 known - IEEE80211_RADIOTAP_VHT_KNOWN_* */ put_unaligned_le16(known, pos); pos += 2; /* u8 flags - IEEE80211_RADIOTAP_VHT_FLAG_* */ if (info->status.rates[0].flags & IEEE80211_TX_RC_SHORT_GI) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; pos++; /* u8 bandwidth */ if (info->status.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) *pos = 1; else if (info->status.rates[0].flags & IEEE80211_TX_RC_80_MHZ_WIDTH) *pos = 4; else if (info->status.rates[0].flags & IEEE80211_TX_RC_160_MHZ_WIDTH) *pos = 11; else /* IEEE80211_TX_RC_{20_MHZ_WIDTH,FIXME:DUP_DATA} */ *pos = 0; pos++; /* u8 mcs_nss[4] */ *pos = (ieee80211_rate_get_vht_mcs(&info->status.rates[0]) << 4) | ieee80211_rate_get_vht_nss(&info->status.rates[0]); pos += 4; /* u8 coding */ pos++; /* u8 group_id */ pos++; /* u16 partial_aid */ pos += 2; } } /* * Handles the tx for TDLS teardown frames. * If the frame wasn't ACKed by the peer - it will be re-sent through the AP */ static void ieee80211_tdls_td_tx_handle(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 flags) { struct sk_buff *teardown_skb; struct sk_buff *orig_teardown_skb; bool is_teardown = false; /* Get the teardown data we need and free the lock */ spin_lock(&sdata->u.mgd.teardown_lock); teardown_skb = sdata->u.mgd.teardown_skb; orig_teardown_skb = sdata->u.mgd.orig_teardown_skb; if ((skb == orig_teardown_skb) && teardown_skb) { sdata->u.mgd.teardown_skb = NULL; sdata->u.mgd.orig_teardown_skb = NULL; is_teardown = true; } spin_unlock(&sdata->u.mgd.teardown_lock); if (is_teardown) { /* This mechanism relies on being able to get ACKs */ WARN_ON(!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)); /* Check if peer has ACKed */ if (flags & IEEE80211_TX_STAT_ACK) { dev_kfree_skb_any(teardown_skb); } else { tdls_dbg(sdata, "TDLS Resending teardown through AP\n"); ieee80211_subif_start_xmit(teardown_skb, skb->dev); } } } static struct ieee80211_sub_if_data * ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_sub_if_data *sdata; if (skb->dev) { list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; if (skb->dev == sdata->dev) return sdata; } return NULL; } return rcu_dereference(local->p2p_sdata); } static void ieee80211_report_ack_skb(struct ieee80211_local *local, struct ieee80211_tx_info *info, bool acked, bool dropped) { struct sk_buff *skb; unsigned long flags; spin_lock_irqsave(&local->ack_status_lock, flags); skb = idr_remove(&local->ack_status_frames, info->ack_frame_id); spin_unlock_irqrestore(&local->ack_status_lock, flags); if (!skb) return; if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) { u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie; struct ieee80211_sub_if_data *sdata; struct ieee80211_hdr *hdr = (void *)skb->data; rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); if (sdata) { if (skb->protocol == sdata->control_port_protocol || skb->protocol == cpu_to_be16(ETH_P_PREAUTH)) cfg80211_control_port_tx_status(&sdata->wdev, cookie, skb->data, skb->len, acked, GFP_ATOMIC); else if (ieee80211_is_any_nullfunc(hdr->frame_control)) cfg80211_probe_status(sdata->dev, hdr->addr1, cookie, acked, info->status.ack_signal, info->status.is_valid_ack_signal, GFP_ATOMIC); else if (ieee80211_is_mgmt(hdr->frame_control)) cfg80211_mgmt_tx_status(&sdata->wdev, cookie, skb->data, skb->len, acked, GFP_ATOMIC); else pr_warn("Unknown status report in ack skb\n"); } rcu_read_unlock(); dev_kfree_skb_any(skb); } else if (dropped) { dev_kfree_skb_any(skb); } else { /* consumes skb */ skb_complete_wifi_ack(skb, acked); } } static void ieee80211_report_used_skb(struct ieee80211_local *local, struct sk_buff *skb, bool dropped) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u16 tx_time_est = ieee80211_info_get_tx_time_est(info); struct ieee80211_hdr *hdr = (void *)skb->data; bool acked = info->flags & IEEE80211_TX_STAT_ACK; if (dropped) acked = false; if (tx_time_est) { struct sta_info *sta; rcu_read_lock(); sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); ieee80211_sta_update_pending_airtime(local, sta, skb_get_queue_mapping(skb), tx_time_est, true); rcu_read_unlock(); } if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) { struct ieee80211_sub_if_data *sdata; rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); if (!sdata) { skb->dev = NULL; } else { unsigned int hdr_size = ieee80211_hdrlen(hdr->frame_control); /* Check to see if packet is a TDLS teardown packet */ if (ieee80211_is_data(hdr->frame_control) && (ieee80211_get_tdls_action(skb, hdr_size) == WLAN_TDLS_TEARDOWN)) { ieee80211_tdls_td_tx_handle(local, sdata, skb, info->flags); } else if (ieee80211_s1g_is_twt_setup(skb)) { if (!acked) { struct sk_buff *qskb; qskb = skb_clone(skb, GFP_ATOMIC); if (qskb) { skb_queue_tail(&sdata->status_queue, qskb); ieee80211_queue_work(&local->hw, &sdata->work); } } } else { ieee80211_mgd_conn_tx_status(sdata, hdr->frame_control, acked); } } rcu_read_unlock(); } else if (info->ack_frame_id) { ieee80211_report_ack_skb(local, info, acked, dropped); } if (!dropped && skb->destructor) { skb->wifi_acked_valid = 1; skb->wifi_acked = acked; } ieee80211_led_tx(local); if (skb_has_frag_list(skb)) { kfree_skb_list(skb_shinfo(skb)->frag_list); skb_shinfo(skb)->frag_list = NULL; } } /* * Use a static threshold for now, best value to be determined * by testing ... * Should it depend on: * - on # of retransmissions * - current throughput (higher value for higher tpt)? */ #define STA_LOST_PKT_THRESHOLD 50 #define STA_LOST_PKT_TIME HZ /* 1 sec since last ACK */ #define STA_LOST_TDLS_PKT_THRESHOLD 10 #define STA_LOST_TDLS_PKT_TIME (10*HZ) /* 10secs since last ACK */ static void ieee80211_lost_packet(struct sta_info *sta, struct ieee80211_tx_info *info) { unsigned long pkt_time = STA_LOST_PKT_TIME; unsigned int pkt_thr = STA_LOST_PKT_THRESHOLD; /* If driver relies on its own algorithm for station kickout, skip * mac80211 packet loss mechanism. */ if (ieee80211_hw_check(&sta->local->hw, REPORTS_LOW_ACK)) return; /* This packet was aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) return; sta->status_stats.lost_packets++; if (sta->sta.tdls) { pkt_time = STA_LOST_TDLS_PKT_TIME; pkt_thr = STA_LOST_PKT_THRESHOLD; } /* * If we're in TDLS mode, make sure that all STA_LOST_TDLS_PKT_THRESHOLD * of the last packets were lost, and that no ACK was received in the * last STA_LOST_TDLS_PKT_TIME ms, before triggering the CQM packet-loss * mechanism. * For non-TDLS, use STA_LOST_PKT_THRESHOLD and STA_LOST_PKT_TIME */ if (sta->status_stats.lost_packets < pkt_thr || !time_after(jiffies, sta->status_stats.last_pkt_time + pkt_time)) return; cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, sta->status_stats.lost_packets, GFP_ATOMIC); sta->status_stats.lost_packets = 0; } static int ieee80211_tx_get_rates(struct ieee80211_hw *hw, struct ieee80211_tx_info *info, int *retry_count) { int count = -1; int i; for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) { /* just the first aggr frame carry status info */ info->status.rates[i].idx = -1; info->status.rates[i].count = 0; break; } else if (info->status.rates[i].idx < 0) { break; } else if (i >= hw->max_report_rates) { /* the HW cannot have attempted that rate */ info->status.rates[i].idx = -1; info->status.rates[i].count = 0; break; } count += info->status.rates[i].count; } if (count < 0) count = 0; *retry_count = count; return i - 1; } void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, struct ieee80211_supported_band *sband, int retry_count, int shift, bool send_to_cooked, struct ieee80211_tx_status *status) { struct sk_buff *skb2; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_sub_if_data *sdata; struct net_device *prev_dev = NULL; int rtap_len; /* send frame to monitor interfaces now */ rtap_len = ieee80211_tx_radiotap_len(info, status); if (WARN_ON_ONCE(skb_headroom(skb) < rtap_len)) { pr_err("ieee80211_tx_status: headroom too small\n"); dev_kfree_skb(skb); return; } ieee80211_add_tx_radiotap_header(local, sband, skb, retry_count, rtap_len, shift, status); /* XXX: is this sufficient for BPF? */ skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_OTHERHOST; skb->protocol = htons(ETH_P_802_2); memset(skb->cb, 0, sizeof(skb->cb)); rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { if (!ieee80211_sdata_running(sdata)) continue; if ((sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) && !send_to_cooked) continue; if (prev_dev) { skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) { skb2->dev = prev_dev; netif_rx(skb2); } } prev_dev = sdata->dev; } } if (prev_dev) { skb->dev = prev_dev; netif_rx(skb); skb = NULL; } rcu_read_unlock(); dev_kfree_skb(skb); } static void __ieee80211_tx_status(struct ieee80211_hw *hw, struct ieee80211_tx_status *status, int rates_idx, int retry_count) { struct sk_buff *skb = status->skb; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = status->info; struct sta_info *sta; __le16 fc; struct ieee80211_supported_band *sband; bool send_to_cooked; bool acked; bool noack_success; struct ieee80211_bar *bar; int shift = 0; int tid = IEEE80211_NUM_TIDS; sband = local->hw.wiphy->bands[info->band]; fc = hdr->frame_control; if (status->sta) { sta = container_of(status->sta, struct sta_info, sta); shift = ieee80211_vif_get_shift(&sta->sdata->vif); if (info->flags & IEEE80211_TX_STATUS_EOSP) clear_sta_flag(sta, WLAN_STA_SP); acked = !!(info->flags & IEEE80211_TX_STAT_ACK); noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED); /* mesh Peer Service Period support */ if (ieee80211_vif_is_mesh(&sta->sdata->vif) && ieee80211_is_data_qos(fc)) ieee80211_mpsp_trigger_process( ieee80211_get_qos_ctl(hdr), sta, true, acked); if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL) && (ieee80211_is_data(hdr->frame_control)) && (rates_idx != -1)) sta->tx_stats.last_rate = info->status.rates[rates_idx]; if ((info->flags & IEEE80211_TX_STAT_AMPDU_NO_BACK) && (ieee80211_is_data_qos(fc))) { u16 ssn; u8 *qc; qc = ieee80211_get_qos_ctl(hdr); tid = qc[0] & 0xf; ssn = ((le16_to_cpu(hdr->seq_ctrl) + 0x10) & IEEE80211_SCTL_SEQ); ieee80211_send_bar(&sta->sdata->vif, hdr->addr1, tid, ssn); } else if (ieee80211_is_data_qos(fc)) { u8 *qc = ieee80211_get_qos_ctl(hdr); tid = qc[0] & 0xf; } if (!acked && ieee80211_is_back_req(fc)) { u16 control; /* * BAR failed, store the last SSN and retry sending * the BAR when the next unicast transmission on the * same TID succeeds. */ bar = (struct ieee80211_bar *) skb->data; control = le16_to_cpu(bar->control); if (!(control & IEEE80211_BAR_CTRL_MULTI_TID)) { u16 ssn = le16_to_cpu(bar->start_seq_num); tid = (control & IEEE80211_BAR_CTRL_TID_INFO_MASK) >> IEEE80211_BAR_CTRL_TID_INFO_SHIFT; ieee80211_set_bar_pending(sta, tid, ssn); } } if (info->flags & IEEE80211_TX_STAT_TX_FILTERED) { ieee80211_handle_filtered_frame(local, sta, skb); return; } else if (ieee80211_is_data_present(fc)) { if (!acked && !noack_success) sta->status_stats.msdu_failed[tid]++; sta->status_stats.msdu_retries[tid] += retry_count; } if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked) ieee80211_frame_acked(sta, skb); } else if (wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) { struct ieee80211_sub_if_data *sdata; struct ieee80211_txq *txq; u32 airtime; /* Account airtime to multicast queue */ sdata = ieee80211_sdata_from_skb(local, skb); if (sdata && (txq = sdata->vif.txq)) { airtime = info->status.tx_time ?: ieee80211_calc_expected_tx_airtime(hw, &sdata->vif, NULL, skb->len, false); ieee80211_register_airtime(txq, airtime, 0); } } /* SNMP counters * Fragments are passed to low-level drivers as separate skbs, so these * are actually fragments, not frames. Update frame counters only for * the first fragment of the frame. */ if ((info->flags & IEEE80211_TX_STAT_ACK) || (info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED)) { if (ieee80211_is_first_frag(hdr->seq_ctrl)) { I802_DEBUG_INC(local->dot11TransmittedFrameCount); if (is_multicast_ether_addr(ieee80211_get_DA(hdr))) I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); if (retry_count > 0) I802_DEBUG_INC(local->dot11RetryCount); if (retry_count > 1) I802_DEBUG_INC(local->dot11MultipleRetryCount); } /* This counter shall be incremented for an acknowledged MPDU * with an individual address in the address 1 field or an MPDU * with a multicast address in the address 1 field of type Data * or Management. */ if (!is_multicast_ether_addr(hdr->addr1) || ieee80211_is_data(fc) || ieee80211_is_mgmt(fc)) I802_DEBUG_INC(local->dot11TransmittedFragmentCount); } else { if (ieee80211_is_first_frag(hdr->seq_ctrl)) I802_DEBUG_INC(local->dot11FailedCount); } if (ieee80211_is_any_nullfunc(fc) && ieee80211_has_pm(fc) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && local->ps_sdata && !(local->scanning)) { if (info->flags & IEEE80211_TX_STAT_ACK) local->ps_sdata->u.mgd.flags |= IEEE80211_STA_NULLFUNC_ACKED; mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(10)); } ieee80211_report_used_skb(local, skb, false); /* this was a transmitted frame, but now we want to reuse it */ skb_orphan(skb); /* Need to make a copy before skb->cb gets cleared */ send_to_cooked = !!(info->flags & IEEE80211_TX_CTL_INJECTED) || !(ieee80211_is_data(fc)); /* * This is a bit racy but we can avoid a lot of work * with this test... */ if (!local->monitors && (!send_to_cooked || !local->cooked_mntrs)) { if (status->free_list) list_add_tail(&skb->list, status->free_list); else dev_kfree_skb(skb); return; } /* send to monitor interfaces */ ieee80211_tx_monitor(local, skb, sband, retry_count, shift, send_to_cooked, status); } void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_status status = { .skb = skb, .info = IEEE80211_SKB_CB(skb), }; struct sta_info *sta; rcu_read_lock(); sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); if (sta) status.sta = &sta->sta; ieee80211_tx_status_ext(hw, &status); rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_tx_status); void ieee80211_tx_status_ext(struct ieee80211_hw *hw, struct ieee80211_tx_status *status) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = status->info; struct ieee80211_sta *pubsta = status->sta; struct sk_buff *skb = status->skb; struct ieee80211_supported_band *sband; struct sta_info *sta = NULL; int rates_idx, retry_count; bool acked, noack_success; u16 tx_time_est; if (pubsta) { sta = container_of(pubsta, struct sta_info, sta); if (status->rate) sta->tx_stats.last_rate_info = *status->rate; } if (skb && (tx_time_est = ieee80211_info_get_tx_time_est(IEEE80211_SKB_CB(skb))) > 0) { /* Do this here to avoid the expensive lookup of the sta * in ieee80211_report_used_skb(). */ ieee80211_sta_update_pending_airtime(local, sta, skb_get_queue_mapping(skb), tx_time_est, true); ieee80211_info_set_tx_time_est(IEEE80211_SKB_CB(skb), 0); } if (!status->info) goto free; rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); sband = hw->wiphy->bands[info->band]; acked = !!(info->flags & IEEE80211_TX_STAT_ACK); noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED); if (pubsta) { struct ieee80211_sub_if_data *sdata = sta->sdata; if (!acked && !noack_success) sta->status_stats.retry_failed++; sta->status_stats.retry_count += retry_count; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { if (sdata->vif.type == NL80211_IFTYPE_STATION && skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP)) ieee80211_sta_tx_notify(sdata, (void *) skb->data, acked, info->status.tx_time); if (acked) { sta->status_stats.last_ack = jiffies; if (sta->status_stats.lost_packets) sta->status_stats.lost_packets = 0; /* Track when last packet was ACKed */ sta->status_stats.last_pkt_time = jiffies; /* Reset connection monitor */ if (sdata->vif.type == NL80211_IFTYPE_STATION && unlikely(sdata->u.mgd.probe_send_count > 0)) sdata->u.mgd.probe_send_count = 0; if (info->status.is_valid_ack_signal) { sta->status_stats.last_ack_signal = (s8)info->status.ack_signal; sta->status_stats.ack_signal_filled = true; ewma_avg_signal_add(&sta->status_stats.avg_ack_signal, -info->status.ack_signal); } } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) { /* * The STA is in power save mode, so assume * that this TX packet failed because of that. */ if (skb) ieee80211_handle_filtered_frame(local, sta, skb); return; } else if (noack_success) { /* nothing to do here, do not account as lost */ } else { ieee80211_lost_packet(sta, info); } } rate_control_tx_status(local, sband, status); if (ieee80211_vif_is_mesh(&sta->sdata->vif)) ieee80211s_update_metric(local, sta, status); } if (skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP)) return __ieee80211_tx_status(hw, status, rates_idx, retry_count); if (acked || noack_success) { I802_DEBUG_INC(local->dot11TransmittedFrameCount); if (!pubsta) I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); if (retry_count > 0) I802_DEBUG_INC(local->dot11RetryCount); if (retry_count > 1) I802_DEBUG_INC(local->dot11MultipleRetryCount); } else { I802_DEBUG_INC(local->dot11FailedCount); } free: if (!skb) return; ieee80211_report_used_skb(local, skb, false); if (status->free_list) list_add_tail(&skb->list, status->free_list); else dev_kfree_skb(skb); } EXPORT_SYMBOL(ieee80211_tx_status_ext); void ieee80211_tx_rate_update(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, struct ieee80211_tx_info *info) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_supported_band *sband = hw->wiphy->bands[info->band]; struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_tx_status status = { .info = info, .sta = pubsta, }; rate_control_tx_status(local, sband, &status); if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) sta->tx_stats.last_rate = info->status.rates[0]; } EXPORT_SYMBOL(ieee80211_tx_rate_update); void ieee80211_tx_status_8023(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct sk_buff *skb) { struct ieee80211_sub_if_data *sdata; struct ieee80211_tx_status status = { .skb = skb, .info = IEEE80211_SKB_CB(skb), }; struct sta_info *sta; sdata = vif_to_sdata(vif); rcu_read_lock(); if (!ieee80211_lookup_ra_sta(sdata, skb, &sta) && !IS_ERR(sta)) status.sta = &sta->sta; ieee80211_tx_status_ext(hw, &status); rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_tx_status_8023); void ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, num_packets, GFP_ATOMIC); } EXPORT_SYMBOL(ieee80211_report_low_ack); void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_local *local = hw_to_local(hw); ieee80211_report_used_skb(local, skb, true); dev_kfree_skb_any(skb); } EXPORT_SYMBOL(ieee80211_free_txskb); void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, struct sk_buff_head *skbs) { struct sk_buff *skb; while ((skb = __skb_dequeue(skbs))) ieee80211_free_txskb(hw, skb); } |
1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 4 4 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 | // SPDX-License-Identifier: GPL-2.0-only /* * CAN driver for PEAK System PCAN-USB Pro adapter * Derived from the PCAN project file driver/src/pcan_usbpro.c * * Copyright (C) 2003-2011 PEAK System-Technik GmbH * Copyright (C) 2011-2012 Stephane Grosjean <s.grosjean@peak-system.com> */ #include <linux/netdevice.h> #include <linux/usb.h> #include <linux/module.h> #include <linux/ethtool.h> #include <linux/can.h> #include <linux/can/dev.h> #include <linux/can/error.h> #include "pcan_usb_core.h" #include "pcan_usb_pro.h" #define PCAN_USBPRO_CHANNEL_COUNT 2 /* PCAN-USB Pro adapter internal clock (MHz) */ #define PCAN_USBPRO_CRYSTAL_HZ 56000000 /* PCAN-USB Pro command timeout (ms.) */ #define PCAN_USBPRO_COMMAND_TIMEOUT 1000 /* PCAN-USB Pro rx/tx buffers size */ #define PCAN_USBPRO_RX_BUFFER_SIZE 1024 #define PCAN_USBPRO_TX_BUFFER_SIZE 64 #define PCAN_USBPRO_MSG_HEADER_LEN 4 /* some commands responses need to be re-submitted */ #define PCAN_USBPRO_RSP_SUBMIT_MAX 2 #define PCAN_USBPRO_RTR 0x01 #define PCAN_USBPRO_EXT 0x02 #define PCAN_USBPRO_SS 0x08 #define PCAN_USBPRO_CMD_BUFFER_SIZE 512 /* handle device specific info used by the netdevices */ struct pcan_usb_pro_interface { struct peak_usb_device *dev[PCAN_USBPRO_CHANNEL_COUNT]; struct peak_time_ref time_ref; int cm_ignore_count; int dev_opened_count; }; /* device information */ struct pcan_usb_pro_device { struct peak_usb_device dev; struct pcan_usb_pro_interface *usb_if; u32 cached_ccbt; }; /* internal structure used to handle messages sent to bulk urb */ struct pcan_usb_pro_msg { u8 *rec_ptr; int rec_buffer_size; int rec_buffer_len; union { __le16 *rec_cnt_rd; __le32 *rec_cnt; u8 *rec_buffer; } u; }; /* records sizes table indexed on message id. (8-bits value) */ static u16 pcan_usb_pro_sizeof_rec[256] = { [PCAN_USBPRO_SETBTR] = sizeof(struct pcan_usb_pro_btr), [PCAN_USBPRO_SETBUSACT] = sizeof(struct pcan_usb_pro_busact), [PCAN_USBPRO_SETSILENT] = sizeof(struct pcan_usb_pro_silent), [PCAN_USBPRO_SETFILTR] = sizeof(struct pcan_usb_pro_filter), [PCAN_USBPRO_SETTS] = sizeof(struct pcan_usb_pro_setts), [PCAN_USBPRO_GETDEVID] = sizeof(struct pcan_usb_pro_devid), [PCAN_USBPRO_SETLED] = sizeof(struct pcan_usb_pro_setled), [PCAN_USBPRO_RXMSG8] = sizeof(struct pcan_usb_pro_rxmsg), [PCAN_USBPRO_RXMSG4] = sizeof(struct pcan_usb_pro_rxmsg) - 4, [PCAN_USBPRO_RXMSG0] = sizeof(struct pcan_usb_pro_rxmsg) - 8, [PCAN_USBPRO_RXRTR] = sizeof(struct pcan_usb_pro_rxmsg) - 8, [PCAN_USBPRO_RXSTATUS] = sizeof(struct pcan_usb_pro_rxstatus), [PCAN_USBPRO_RXTS] = sizeof(struct pcan_usb_pro_rxts), [PCAN_USBPRO_TXMSG8] = sizeof(struct pcan_usb_pro_txmsg), [PCAN_USBPRO_TXMSG4] = sizeof(struct pcan_usb_pro_txmsg) - 4, [PCAN_USBPRO_TXMSG0] = sizeof(struct pcan_usb_pro_txmsg) - 8, }; /* * initialize PCAN-USB Pro message data structure */ static u8 *pcan_msg_init(struct pcan_usb_pro_msg *pm, void *buffer_addr, int buffer_size) { if (buffer_size < PCAN_USBPRO_MSG_HEADER_LEN) return NULL; pm->u.rec_buffer = (u8 *)buffer_addr; pm->rec_buffer_size = pm->rec_buffer_len = buffer_size; pm->rec_ptr = pm->u.rec_buffer + PCAN_USBPRO_MSG_HEADER_LEN; return pm->rec_ptr; } static u8 *pcan_msg_init_empty(struct pcan_usb_pro_msg *pm, void *buffer_addr, int buffer_size) { u8 *pr = pcan_msg_init(pm, buffer_addr, buffer_size); if (pr) { pm->rec_buffer_len = PCAN_USBPRO_MSG_HEADER_LEN; *pm->u.rec_cnt = 0; } return pr; } /* * add one record to a message being built */ static int pcan_msg_add_rec(struct pcan_usb_pro_msg *pm, int id, ...) { int len, i; u8 *pc; va_list ap; va_start(ap, id); pc = pm->rec_ptr + 1; i = 0; switch (id) { case PCAN_USBPRO_TXMSG8: i += 4; fallthrough; case PCAN_USBPRO_TXMSG4: i += 4; fallthrough; case PCAN_USBPRO_TXMSG0: *pc++ = va_arg(ap, int); *pc++ = va_arg(ap, int); *pc++ = va_arg(ap, int); *(__le32 *)pc = cpu_to_le32(va_arg(ap, u32)); pc += 4; memcpy(pc, va_arg(ap, int *), i); pc += i; break; case PCAN_USBPRO_SETBTR: case PCAN_USBPRO_GETDEVID: *pc++ = va_arg(ap, int); pc += 2; *(__le32 *)pc = cpu_to_le32(va_arg(ap, u32)); pc += 4; break; case PCAN_USBPRO_SETFILTR: case PCAN_USBPRO_SETBUSACT: case PCAN_USBPRO_SETSILENT: *pc++ = va_arg(ap, int); *(__le16 *)pc = cpu_to_le16(va_arg(ap, int)); pc += 2; break; case PCAN_USBPRO_SETLED: *pc++ = va_arg(ap, int); *(__le16 *)pc = cpu_to_le16(va_arg(ap, int)); pc += 2; *(__le32 *)pc = cpu_to_le32(va_arg(ap, u32)); pc += 4; break; case PCAN_USBPRO_SETTS: pc++; *(__le16 *)pc = cpu_to_le16(va_arg(ap, int)); pc += 2; break; default: pr_err("%s: %s(): unknown data type %02Xh (%d)\n", PCAN_USB_DRIVER_NAME, __func__, id, id); pc--; break; } len = pc - pm->rec_ptr; if (len > 0) { le32_add_cpu(pm->u.rec_cnt, 1); *pm->rec_ptr = id; pm->rec_ptr = pc; pm->rec_buffer_len += len; } va_end(ap); return len; } /* * send PCAN-USB Pro command synchronously */ static int pcan_usb_pro_send_cmd(struct peak_usb_device *dev, struct pcan_usb_pro_msg *pum) { int actual_length; int err; /* usb device unregistered? */ if (!(dev->state & PCAN_USB_STATE_CONNECTED)) return 0; err = usb_bulk_msg(dev->udev, usb_sndbulkpipe(dev->udev, PCAN_USBPRO_EP_CMDOUT), pum->u.rec_buffer, pum->rec_buffer_len, &actual_length, PCAN_USBPRO_COMMAND_TIMEOUT); if (err) netdev_err(dev->netdev, "sending command failure: %d\n", err); return err; } /* * wait for PCAN-USB Pro command response */ static int pcan_usb_pro_wait_rsp(struct peak_usb_device *dev, struct pcan_usb_pro_msg *pum) { u8 req_data_type, req_channel; int actual_length; int i, err = 0; /* usb device unregistered? */ if (!(dev->state & PCAN_USB_STATE_CONNECTED)) return 0; req_data_type = pum->u.rec_buffer[4]; req_channel = pum->u.rec_buffer[5]; *pum->u.rec_cnt = 0; for (i = 0; !err && i < PCAN_USBPRO_RSP_SUBMIT_MAX; i++) { struct pcan_usb_pro_msg rsp; union pcan_usb_pro_rec *pr; u32 r, rec_cnt; u16 rec_len; u8 *pc; err = usb_bulk_msg(dev->udev, usb_rcvbulkpipe(dev->udev, PCAN_USBPRO_EP_CMDIN), pum->u.rec_buffer, pum->rec_buffer_len, &actual_length, PCAN_USBPRO_COMMAND_TIMEOUT); if (err) { netdev_err(dev->netdev, "waiting rsp error %d\n", err); break; } if (actual_length == 0) continue; err = -EBADMSG; if (actual_length < PCAN_USBPRO_MSG_HEADER_LEN) { netdev_err(dev->netdev, "got abnormal too small rsp (len=%d)\n", actual_length); break; } pc = pcan_msg_init(&rsp, pum->u.rec_buffer, actual_length); rec_cnt = le32_to_cpu(*rsp.u.rec_cnt); /* loop on records stored into message */ for (r = 0; r < rec_cnt; r++) { pr = (union pcan_usb_pro_rec *)pc; rec_len = pcan_usb_pro_sizeof_rec[pr->data_type]; if (!rec_len) { netdev_err(dev->netdev, "got unprocessed record in msg\n"); pcan_dump_mem("rcvd rsp msg", pum->u.rec_buffer, actual_length); break; } /* check if response corresponds to request */ if (pr->data_type != req_data_type) netdev_err(dev->netdev, "got unwanted rsp %xh: ignored\n", pr->data_type); /* check if channel in response corresponds too */ else if ((req_channel != 0xff) && (pr->bus_act.channel != req_channel)) netdev_err(dev->netdev, "got rsp %xh but on chan%u: ignored\n", req_data_type, pr->bus_act.channel); /* got the response */ else return 0; /* otherwise, go on with next record in message */ pc += rec_len; } } return (i >= PCAN_USBPRO_RSP_SUBMIT_MAX) ? -ERANGE : err; } int pcan_usb_pro_send_req(struct peak_usb_device *dev, int req_id, int req_value, void *req_addr, int req_size) { int err; u8 req_type; unsigned int p; /* usb device unregistered? */ if (!(dev->state & PCAN_USB_STATE_CONNECTED)) return 0; req_type = USB_TYPE_VENDOR | USB_RECIP_OTHER; switch (req_id) { case PCAN_USBPRO_REQ_FCT: p = usb_sndctrlpipe(dev->udev, 0); break; default: p = usb_rcvctrlpipe(dev->udev, 0); req_type |= USB_DIR_IN; memset(req_addr, '\0', req_size); break; } err = usb_control_msg(dev->udev, p, req_id, req_type, req_value, 0, req_addr, req_size, 2 * USB_CTRL_GET_TIMEOUT); if (err < 0) { netdev_info(dev->netdev, "unable to request usb[type=%d value=%d] err=%d\n", req_id, req_value, err); return err; } return 0; } static int pcan_usb_pro_set_ts(struct peak_usb_device *dev, u16 onoff) { struct pcan_usb_pro_msg um; pcan_msg_init_empty(&um, dev->cmd_buf, PCAN_USB_MAX_CMD_LEN); pcan_msg_add_rec(&um, PCAN_USBPRO_SETTS, onoff); return pcan_usb_pro_send_cmd(dev, &um); } static int pcan_usb_pro_set_bitrate(struct peak_usb_device *dev, u32 ccbt) { struct pcan_usb_pro_device *pdev = container_of(dev, struct pcan_usb_pro_device, dev); struct pcan_usb_pro_msg um; pcan_msg_init_empty(&um, dev->cmd_buf, PCAN_USB_MAX_CMD_LEN); pcan_msg_add_rec(&um, PCAN_USBPRO_SETBTR, dev->ctrl_idx, ccbt); /* cache the CCBT value to reuse it before next buson */ pdev->cached_ccbt = ccbt; return pcan_usb_pro_send_cmd(dev, &um); } static int pcan_usb_pro_set_bus(struct peak_usb_device *dev, u8 onoff) { struct pcan_usb_pro_msg um; /* if bus=on, be sure the bitrate being set before! */ if (onoff) { struct pcan_usb_pro_device *pdev = container_of(dev, struct pcan_usb_pro_device, dev); pcan_usb_pro_set_bitrate(dev, pdev->cached_ccbt); } pcan_msg_init_empty(&um, dev->cmd_buf, PCAN_USB_MAX_CMD_LEN); pcan_msg_add_rec(&um, PCAN_USBPRO_SETBUSACT, dev->ctrl_idx, onoff); return pcan_usb_pro_send_cmd(dev, &um); } static int pcan_usb_pro_set_silent(struct peak_usb_device *dev, u8 onoff) { struct pcan_usb_pro_msg um; pcan_msg_init_empty(&um, dev->cmd_buf, PCAN_USB_MAX_CMD_LEN); pcan_msg_add_rec(&um, PCAN_USBPRO_SETSILENT, dev->ctrl_idx, onoff); return pcan_usb_pro_send_cmd(dev, &um); } static int pcan_usb_pro_set_filter(struct peak_usb_device *dev, u16 filter_mode) { struct pcan_usb_pro_msg um; pcan_msg_init_empty(&um, dev->cmd_buf, PCAN_USB_MAX_CMD_LEN); pcan_msg_add_rec(&um, PCAN_USBPRO_SETFILTR, dev->ctrl_idx, filter_mode); return pcan_usb_pro_send_cmd(dev, &um); } static int pcan_usb_pro_set_led(struct peak_usb_device *dev, u8 mode, u32 timeout) { struct pcan_usb_pro_msg um; pcan_msg_init_empty(&um, dev->cmd_buf, PCAN_USB_MAX_CMD_LEN); pcan_msg_add_rec(&um, PCAN_USBPRO_SETLED, dev->ctrl_idx, mode, timeout); return pcan_usb_pro_send_cmd(dev, &um); } static int pcan_usb_pro_get_device_id(struct peak_usb_device *dev, u32 *device_id) { struct pcan_usb_pro_devid *pdn; struct pcan_usb_pro_msg um; int err; u8 *pc; pc = pcan_msg_init_empty(&um, dev->cmd_buf, PCAN_USB_MAX_CMD_LEN); pcan_msg_add_rec(&um, PCAN_USBPRO_GETDEVID, dev->ctrl_idx); err = pcan_usb_pro_send_cmd(dev, &um); if (err) return err; err = pcan_usb_pro_wait_rsp(dev, &um); if (err) return err; pdn = (struct pcan_usb_pro_devid *)pc; *device_id = le32_to_cpu(pdn->serial_num); return err; } static int pcan_usb_pro_set_bittiming(struct peak_usb_device *dev, struct can_bittiming *bt) { u32 ccbt; ccbt = (dev->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES) ? 0x00800000 : 0; ccbt |= (bt->sjw - 1) << 24; ccbt |= (bt->phase_seg2 - 1) << 20; ccbt |= (bt->prop_seg + bt->phase_seg1 - 1) << 16; /* = tseg1 */ ccbt |= bt->brp - 1; netdev_info(dev->netdev, "setting ccbt=0x%08x\n", ccbt); return pcan_usb_pro_set_bitrate(dev, ccbt); } void pcan_usb_pro_restart_complete(struct urb *urb) { /* can delete usb resources */ peak_usb_async_complete(urb); /* notify candev and netdev */ peak_usb_restart_complete(urb->context); } /* * handle restart but in asynchronously way */ static int pcan_usb_pro_restart_async(struct peak_usb_device *dev, struct urb *urb, u8 *buf) { struct pcan_usb_pro_msg um; pcan_msg_init_empty(&um, buf, PCAN_USB_MAX_CMD_LEN); pcan_msg_add_rec(&um, PCAN_USBPRO_SETBUSACT, dev->ctrl_idx, 1); usb_fill_bulk_urb(urb, dev->udev, usb_sndbulkpipe(dev->udev, PCAN_USBPRO_EP_CMDOUT), buf, PCAN_USB_MAX_CMD_LEN, pcan_usb_pro_restart_complete, dev); return usb_submit_urb(urb, GFP_ATOMIC); } static int pcan_usb_pro_drv_loaded(struct peak_usb_device *dev, int loaded) { u8 *buffer; int err; buffer = kzalloc(PCAN_USBPRO_FCT_DRVLD_REQ_LEN, GFP_KERNEL); if (!buffer) return -ENOMEM; buffer[0] = 0; buffer[1] = !!loaded; err = pcan_usb_pro_send_req(dev, PCAN_USBPRO_REQ_FCT, PCAN_USBPRO_FCT_DRVLD, buffer, PCAN_USBPRO_FCT_DRVLD_REQ_LEN); kfree(buffer); return err; } static inline struct pcan_usb_pro_interface *pcan_usb_pro_dev_if(struct peak_usb_device *dev) { struct pcan_usb_pro_device *pdev = container_of(dev, struct pcan_usb_pro_device, dev); return pdev->usb_if; } static int pcan_usb_pro_handle_canmsg(struct pcan_usb_pro_interface *usb_if, struct pcan_usb_pro_rxmsg *rx) { const unsigned int ctrl_idx = (rx->len >> 4) & 0x0f; struct peak_usb_device *dev = usb_if->dev[ctrl_idx]; struct net_device *netdev = dev->netdev; struct can_frame *can_frame; struct sk_buff *skb; struct skb_shared_hwtstamps *hwts; skb = alloc_can_skb(netdev, &can_frame); if (!skb) return -ENOMEM; can_frame->can_id = le32_to_cpu(rx->id); can_frame->len = rx->len & 0x0f; if (rx->flags & PCAN_USBPRO_EXT) can_frame->can_id |= CAN_EFF_FLAG; if (rx->flags & PCAN_USBPRO_RTR) can_frame->can_id |= CAN_RTR_FLAG; else memcpy(can_frame->data, rx->data, can_frame->len); hwts = skb_hwtstamps(skb); peak_usb_get_ts_time(&usb_if->time_ref, le32_to_cpu(rx->ts32), &hwts->hwtstamp); netdev->stats.rx_packets++; netdev->stats.rx_bytes += can_frame->len; netif_rx(skb); return 0; } static int pcan_usb_pro_handle_error(struct pcan_usb_pro_interface *usb_if, struct pcan_usb_pro_rxstatus *er) { const u16 raw_status = le16_to_cpu(er->status); const unsigned int ctrl_idx = (er->channel >> 4) & 0x0f; struct peak_usb_device *dev = usb_if->dev[ctrl_idx]; struct net_device *netdev = dev->netdev; struct can_frame *can_frame; enum can_state new_state = CAN_STATE_ERROR_ACTIVE; u8 err_mask = 0; struct sk_buff *skb; struct skb_shared_hwtstamps *hwts; /* nothing should be sent while in BUS_OFF state */ if (dev->can.state == CAN_STATE_BUS_OFF) return 0; if (!raw_status) { /* no error bit (back to active state) */ dev->can.state = CAN_STATE_ERROR_ACTIVE; return 0; } if (raw_status & (PCAN_USBPRO_STATUS_OVERRUN | PCAN_USBPRO_STATUS_QOVERRUN)) { /* trick to bypass next comparison and process other errors */ new_state = CAN_STATE_MAX; } if (raw_status & PCAN_USBPRO_STATUS_BUS) { new_state = CAN_STATE_BUS_OFF; } else if (raw_status & PCAN_USBPRO_STATUS_ERROR) { u32 rx_err_cnt = (le32_to_cpu(er->err_frm) & 0x00ff0000) >> 16; u32 tx_err_cnt = (le32_to_cpu(er->err_frm) & 0xff000000) >> 24; if (rx_err_cnt > 127) err_mask |= CAN_ERR_CRTL_RX_PASSIVE; else if (rx_err_cnt > 96) err_mask |= CAN_ERR_CRTL_RX_WARNING; if (tx_err_cnt > 127) err_mask |= CAN_ERR_CRTL_TX_PASSIVE; else if (tx_err_cnt > 96) err_mask |= CAN_ERR_CRTL_TX_WARNING; if (err_mask & (CAN_ERR_CRTL_RX_WARNING | CAN_ERR_CRTL_TX_WARNING)) new_state = CAN_STATE_ERROR_WARNING; else if (err_mask & (CAN_ERR_CRTL_RX_PASSIVE | CAN_ERR_CRTL_TX_PASSIVE)) new_state = CAN_STATE_ERROR_PASSIVE; } /* donot post any error if current state didn't change */ if (dev->can.state == new_state) return 0; /* allocate an skb to store the error frame */ skb = alloc_can_err_skb(netdev, &can_frame); if (!skb) return -ENOMEM; switch (new_state) { case CAN_STATE_BUS_OFF: can_frame->can_id |= CAN_ERR_BUSOFF; dev->can.can_stats.bus_off++; can_bus_off(netdev); break; case CAN_STATE_ERROR_PASSIVE: can_frame->can_id |= CAN_ERR_CRTL; can_frame->data[1] |= err_mask; dev->can.can_stats.error_passive++; break; case CAN_STATE_ERROR_WARNING: can_frame->can_id |= CAN_ERR_CRTL; can_frame->data[1] |= err_mask; dev->can.can_stats.error_warning++; break; case CAN_STATE_ERROR_ACTIVE: break; default: /* CAN_STATE_MAX (trick to handle other errors) */ if (raw_status & PCAN_USBPRO_STATUS_OVERRUN) { can_frame->can_id |= CAN_ERR_PROT; can_frame->data[2] |= CAN_ERR_PROT_OVERLOAD; netdev->stats.rx_over_errors++; netdev->stats.rx_errors++; } if (raw_status & PCAN_USBPRO_STATUS_QOVERRUN) { can_frame->can_id |= CAN_ERR_CRTL; can_frame->data[1] |= CAN_ERR_CRTL_RX_OVERFLOW; netdev->stats.rx_over_errors++; netdev->stats.rx_errors++; } new_state = CAN_STATE_ERROR_ACTIVE; break; } dev->can.state = new_state; hwts = skb_hwtstamps(skb); peak_usb_get_ts_time(&usb_if->time_ref, le32_to_cpu(er->ts32), &hwts->hwtstamp); netif_rx(skb); return 0; } static void pcan_usb_pro_handle_ts(struct pcan_usb_pro_interface *usb_if, struct pcan_usb_pro_rxts *ts) { /* should wait until clock is stabilized */ if (usb_if->cm_ignore_count > 0) usb_if->cm_ignore_count--; else peak_usb_set_ts_now(&usb_if->time_ref, le32_to_cpu(ts->ts64[1])); } /* * callback for bulk IN urb */ static int pcan_usb_pro_decode_buf(struct peak_usb_device *dev, struct urb *urb) { struct pcan_usb_pro_interface *usb_if = pcan_usb_pro_dev_if(dev); struct net_device *netdev = dev->netdev; struct pcan_usb_pro_msg usb_msg; u8 *rec_ptr, *msg_end; u16 rec_cnt; int err = 0; rec_ptr = pcan_msg_init(&usb_msg, urb->transfer_buffer, urb->actual_length); if (!rec_ptr) { netdev_err(netdev, "bad msg hdr len %d\n", urb->actual_length); return -EINVAL; } /* loop reading all the records from the incoming message */ msg_end = urb->transfer_buffer + urb->actual_length; rec_cnt = le16_to_cpu(*usb_msg.u.rec_cnt_rd); for (; rec_cnt > 0; rec_cnt--) { union pcan_usb_pro_rec *pr = (union pcan_usb_pro_rec *)rec_ptr; u16 sizeof_rec = pcan_usb_pro_sizeof_rec[pr->data_type]; if (!sizeof_rec) { netdev_err(netdev, "got unsupported rec in usb msg:\n"); err = -ENOTSUPP; break; } /* check if the record goes out of current packet */ if (rec_ptr + sizeof_rec > msg_end) { netdev_err(netdev, "got frag rec: should inc usb rx buf size\n"); err = -EBADMSG; break; } switch (pr->data_type) { case PCAN_USBPRO_RXMSG8: case PCAN_USBPRO_RXMSG4: case PCAN_USBPRO_RXMSG0: case PCAN_USBPRO_RXRTR: err = pcan_usb_pro_handle_canmsg(usb_if, &pr->rx_msg); if (err < 0) goto fail; break; case PCAN_USBPRO_RXSTATUS: err = pcan_usb_pro_handle_error(usb_if, &pr->rx_status); if (err < 0) goto fail; break; case PCAN_USBPRO_RXTS: pcan_usb_pro_handle_ts(usb_if, &pr->rx_ts); break; default: netdev_err(netdev, "unhandled rec type 0x%02x (%d): ignored\n", pr->data_type, pr->data_type); break; } rec_ptr += sizeof_rec; } fail: if (err) pcan_dump_mem("received msg", urb->transfer_buffer, urb->actual_length); return err; } static int pcan_usb_pro_encode_msg(struct peak_usb_device *dev, struct sk_buff *skb, u8 *obuf, size_t *size) { struct can_frame *cf = (struct can_frame *)skb->data; u8 data_type, len, flags; struct pcan_usb_pro_msg usb_msg; pcan_msg_init_empty(&usb_msg, obuf, *size); if ((cf->can_id & CAN_RTR_FLAG) || (cf->len == 0)) data_type = PCAN_USBPRO_TXMSG0; else if (cf->len <= 4) data_type = PCAN_USBPRO_TXMSG4; else data_type = PCAN_USBPRO_TXMSG8; len = (dev->ctrl_idx << 4) | (cf->len & 0x0f); flags = 0; if (cf->can_id & CAN_EFF_FLAG) flags |= PCAN_USBPRO_EXT; if (cf->can_id & CAN_RTR_FLAG) flags |= PCAN_USBPRO_RTR; /* Single-Shot frame */ if (dev->can.ctrlmode & CAN_CTRLMODE_ONE_SHOT) flags |= PCAN_USBPRO_SS; pcan_msg_add_rec(&usb_msg, data_type, 0, flags, len, cf->can_id, cf->data); *size = usb_msg.rec_buffer_len; return 0; } static int pcan_usb_pro_start(struct peak_usb_device *dev) { struct pcan_usb_pro_device *pdev = container_of(dev, struct pcan_usb_pro_device, dev); int err; err = pcan_usb_pro_set_silent(dev, dev->can.ctrlmode & CAN_CTRLMODE_LISTENONLY); if (err) return err; /* filter mode: 0-> All OFF; 1->bypass */ err = pcan_usb_pro_set_filter(dev, 1); if (err) return err; /* opening first device: */ if (pdev->usb_if->dev_opened_count == 0) { /* reset time_ref */ peak_usb_init_time_ref(&pdev->usb_if->time_ref, &pcan_usb_pro); /* ask device to send ts messages */ err = pcan_usb_pro_set_ts(dev, 1); } pdev->usb_if->dev_opened_count++; return err; } /* * stop interface * (last chance before set bus off) */ static int pcan_usb_pro_stop(struct peak_usb_device *dev) { struct pcan_usb_pro_device *pdev = container_of(dev, struct pcan_usb_pro_device, dev); /* turn off ts msgs for that interface if no other dev opened */ if (pdev->usb_if->dev_opened_count == 1) pcan_usb_pro_set_ts(dev, 0); pdev->usb_if->dev_opened_count--; return 0; } /* * called when probing to initialize a device object. */ static int pcan_usb_pro_init(struct peak_usb_device *dev) { struct pcan_usb_pro_device *pdev = container_of(dev, struct pcan_usb_pro_device, dev); struct pcan_usb_pro_interface *usb_if = NULL; struct pcan_usb_pro_fwinfo *fi = NULL; struct pcan_usb_pro_blinfo *bi = NULL; int err; /* do this for 1st channel only */ if (!dev->prev_siblings) { /* allocate netdevices common structure attached to first one */ usb_if = kzalloc(sizeof(struct pcan_usb_pro_interface), GFP_KERNEL); fi = kmalloc(sizeof(struct pcan_usb_pro_fwinfo), GFP_KERNEL); bi = kmalloc(sizeof(struct pcan_usb_pro_blinfo), GFP_KERNEL); if (!usb_if || !fi || !bi) { err = -ENOMEM; goto err_out; } /* number of ts msgs to ignore before taking one into account */ usb_if->cm_ignore_count = 5; /* * explicit use of dev_xxx() instead of netdev_xxx() here: * information displayed are related to the device itself, not * to the canx netdevices. */ err = pcan_usb_pro_send_req(dev, PCAN_USBPRO_REQ_INFO, PCAN_USBPRO_INFO_FW, fi, sizeof(*fi)); if (err) { dev_err(dev->netdev->dev.parent, "unable to read %s firmware info (err %d)\n", pcan_usb_pro.name, err); goto err_out; } err = pcan_usb_pro_send_req(dev, PCAN_USBPRO_REQ_INFO, PCAN_USBPRO_INFO_BL, bi, sizeof(*bi)); if (err) { dev_err(dev->netdev->dev.parent, "unable to read %s bootloader info (err %d)\n", pcan_usb_pro.name, err); goto err_out; } /* tell the device the can driver is running */ err = pcan_usb_pro_drv_loaded(dev, 1); if (err) goto err_out; dev_info(dev->netdev->dev.parent, "PEAK-System %s hwrev %u serial %08X.%08X (%u channels)\n", pcan_usb_pro.name, bi->hw_rev, bi->serial_num_hi, bi->serial_num_lo, pcan_usb_pro.ctrl_count); } else { usb_if = pcan_usb_pro_dev_if(dev->prev_siblings); } pdev->usb_if = usb_if; usb_if->dev[dev->ctrl_idx] = dev; /* set LED in default state (end of init phase) */ pcan_usb_pro_set_led(dev, PCAN_USBPRO_LED_DEVICE, 1); kfree(bi); kfree(fi); return 0; err_out: kfree(bi); kfree(fi); kfree(usb_if); return err; } static void pcan_usb_pro_exit(struct peak_usb_device *dev) { struct pcan_usb_pro_device *pdev = container_of(dev, struct pcan_usb_pro_device, dev); /* * when rmmod called before unplug and if down, should reset things * before leaving */ if (dev->can.state != CAN_STATE_STOPPED) { /* set bus off on the corresponding channel */ pcan_usb_pro_set_bus(dev, 0); } /* if channel #0 (only) */ if (dev->ctrl_idx == 0) { /* turn off calibration message if any device were opened */ if (pdev->usb_if->dev_opened_count > 0) pcan_usb_pro_set_ts(dev, 0); /* tell the PCAN-USB Pro device the driver is being unloaded */ pcan_usb_pro_drv_loaded(dev, 0); } } /* * called when PCAN-USB Pro adapter is unplugged */ static void pcan_usb_pro_free(struct peak_usb_device *dev) { /* last device: can free pcan_usb_pro_interface object now */ if (!dev->prev_siblings && !dev->next_siblings) kfree(pcan_usb_pro_dev_if(dev)); } /* * probe function for new PCAN-USB Pro usb interface */ int pcan_usb_pro_probe(struct usb_interface *intf) { struct usb_host_interface *if_desc; int i; if_desc = intf->altsetting; /* check interface endpoint addresses */ for (i = 0; i < if_desc->desc.bNumEndpoints; i++) { struct usb_endpoint_descriptor *ep = &if_desc->endpoint[i].desc; /* * below is the list of valid ep addresses. Any other ep address * is considered as not-CAN interface address => no dev created */ switch (ep->bEndpointAddress) { case PCAN_USBPRO_EP_CMDOUT: case PCAN_USBPRO_EP_CMDIN: case PCAN_USBPRO_EP_MSGOUT_0: case PCAN_USBPRO_EP_MSGOUT_1: case PCAN_USBPRO_EP_MSGIN: case PCAN_USBPRO_EP_UNUSED: break; default: return -ENODEV; } } return 0; } static int pcan_usb_pro_set_phys_id(struct net_device *netdev, enum ethtool_phys_id_state state) { struct peak_usb_device *dev = netdev_priv(netdev); int err = 0; switch (state) { case ETHTOOL_ID_ACTIVE: /* fast blinking forever */ err = pcan_usb_pro_set_led(dev, PCAN_USBPRO_LED_BLINK_FAST, 0xffffffff); break; case ETHTOOL_ID_INACTIVE: /* restore LED default */ err = pcan_usb_pro_set_led(dev, PCAN_USBPRO_LED_DEVICE, 1); break; default: break; } return err; } static const struct ethtool_ops pcan_usb_pro_ethtool_ops = { .set_phys_id = pcan_usb_pro_set_phys_id, }; /* * describe the PCAN-USB Pro adapter */ static const struct can_bittiming_const pcan_usb_pro_const = { .name = "pcan_usb_pro", .tseg1_min = 1, .tseg1_max = 16, .tseg2_min = 1, .tseg2_max = 8, .sjw_max = 4, .brp_min = 1, .brp_max = 1024, .brp_inc = 1, }; const struct peak_usb_adapter pcan_usb_pro = { .name = "PCAN-USB Pro", .device_id = PCAN_USBPRO_PRODUCT_ID, .ctrl_count = PCAN_USBPRO_CHANNEL_COUNT, .ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES | CAN_CTRLMODE_LISTENONLY | CAN_CTRLMODE_ONE_SHOT, .clock = { .freq = PCAN_USBPRO_CRYSTAL_HZ, }, .bittiming_const = &pcan_usb_pro_const, /* size of device private data */ .sizeof_dev_private = sizeof(struct pcan_usb_pro_device), .ethtool_ops = &pcan_usb_pro_ethtool_ops, /* timestamps usage */ .ts_used_bits = 32, .us_per_ts_scale = 1, /* us = (ts * scale) >> shift */ .us_per_ts_shift = 0, /* give here messages in/out endpoints */ .ep_msg_in = PCAN_USBPRO_EP_MSGIN, .ep_msg_out = {PCAN_USBPRO_EP_MSGOUT_0, PCAN_USBPRO_EP_MSGOUT_1}, /* size of rx/tx usb buffers */ .rx_buffer_size = PCAN_USBPRO_RX_BUFFER_SIZE, .tx_buffer_size = PCAN_USBPRO_TX_BUFFER_SIZE, /* device callbacks */ .intf_probe = pcan_usb_pro_probe, .dev_init = pcan_usb_pro_init, .dev_exit = pcan_usb_pro_exit, .dev_free = pcan_usb_pro_free, .dev_set_bus = pcan_usb_pro_set_bus, .dev_set_bittiming = pcan_usb_pro_set_bittiming, .dev_get_device_id = pcan_usb_pro_get_device_id, .dev_decode_buf = pcan_usb_pro_decode_buf, .dev_encode_msg = pcan_usb_pro_encode_msg, .dev_start = pcan_usb_pro_start, .dev_stop = pcan_usb_pro_stop, .dev_restart_async = pcan_usb_pro_restart_async, }; |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_RTNETLINK_H #define __NET_RTNETLINK_H #include <linux/rtnetlink.h> #include <net/netlink.h> typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *); typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); enum rtnl_link_flags { RTNL_FLAG_DOIT_UNLOCKED = 1, }; enum rtnl_kinds { RTNL_KIND_NEW, RTNL_KIND_DEL, RTNL_KIND_GET, RTNL_KIND_SET }; struct rtnl_msg_handler { struct module *owner; int protocol; int msgtype; rtnl_doit_func doit; rtnl_dumpit_func dumpit; int flags; }; void rtnl_register(int protocol, int msgtype, rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); int rtnl_register_module(struct module *owner, int protocol, int msgtype, rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); int rtnl_unregister(int protocol, int msgtype); void rtnl_unregister_all(int protocol); int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n); void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n); #define rtnl_register_many(handlers) \ __rtnl_register_many(handlers, ARRAY_SIZE(handlers)) #define rtnl_unregister_many(handlers) \ __rtnl_unregister_many(handlers, ARRAY_SIZE(handlers)) static inline int rtnl_msg_family(const struct nlmsghdr *nlh) { if (nlmsg_len(nlh) >= sizeof(struct rtgenmsg)) return ((struct rtgenmsg *) nlmsg_data(nlh))->rtgen_family; else return AF_UNSPEC; } /** * struct rtnl_link_ops - rtnetlink link operations * * @list: Used internally * @kind: Identifier * @netns_refund: Physical device, move to init_net on netns exit * @maxtype: Highest device specific netlink attribute number * @policy: Netlink policy for device specific attribute validation * @validate: Optional validation function for netlink/changelink parameters * @alloc: netdev allocation function, can be %NULL and is then used * in place of alloc_netdev_mqs(), in this case @priv_size * and @setup are unused. Returns a netdev or ERR_PTR(). * @priv_size: sizeof net_device private space * @setup: net_device setup function * @newlink: Function for configuring and registering a new device * @changelink: Function for changing parameters of an existing device * @dellink: Function to remove a device * @get_size: Function to calculate required room for dumping device * specific netlink attributes * @fill_info: Function to dump device specific netlink attributes * @get_xstats_size: Function to calculate required room for dumping device * specific statistics * @fill_xstats: Function to dump device specific statistics * @get_num_tx_queues: Function to determine number of transmit queues * to create when creating a new device. * @get_num_rx_queues: Function to determine number of receive queues * to create when creating a new device. * @get_link_net: Function to get the i/o netns of the device * @get_linkxstats_size: Function to calculate the required room for * dumping device-specific extended link stats * @fill_linkxstats: Function to dump device-specific extended link stats */ struct rtnl_link_ops { struct list_head list; const char *kind; size_t priv_size; struct net_device *(*alloc)(struct nlattr *tb[], const char *ifname, unsigned char name_assign_type, unsigned int num_tx_queues, unsigned int num_rx_queues); void (*setup)(struct net_device *dev); bool netns_refund; unsigned int maxtype; const struct nla_policy *policy; int (*validate)(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack); int (*newlink)(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack); int (*changelink)(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack); void (*dellink)(struct net_device *dev, struct list_head *head); size_t (*get_size)(const struct net_device *dev); int (*fill_info)(struct sk_buff *skb, const struct net_device *dev); size_t (*get_xstats_size)(const struct net_device *dev); int (*fill_xstats)(struct sk_buff *skb, const struct net_device *dev); unsigned int (*get_num_tx_queues)(void); unsigned int (*get_num_rx_queues)(void); unsigned int slave_maxtype; const struct nla_policy *slave_policy; int (*slave_changelink)(struct net_device *dev, struct net_device *slave_dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack); size_t (*get_slave_size)(const struct net_device *dev, const struct net_device *slave_dev); int (*fill_slave_info)(struct sk_buff *skb, const struct net_device *dev, const struct net_device *slave_dev); struct net *(*get_link_net)(const struct net_device *dev); size_t (*get_linkxstats_size)(const struct net_device *dev, int attr); int (*fill_linkxstats)(struct sk_buff *skb, const struct net_device *dev, int *prividx, int attr); }; int __rtnl_link_register(struct rtnl_link_ops *ops); void __rtnl_link_unregister(struct rtnl_link_ops *ops); int rtnl_link_register(struct rtnl_link_ops *ops); void rtnl_link_unregister(struct rtnl_link_ops *ops); /** * struct rtnl_af_ops - rtnetlink address family operations * * @list: Used internally * @family: Address family * @fill_link_af: Function to fill IFLA_AF_SPEC with address family * specific netlink attributes. * @get_link_af_size: Function to calculate size of address family specific * netlink attributes. * @validate_link_af: Validate a IFLA_AF_SPEC attribute, must check attr * for invalid configuration settings. * @set_link_af: Function to parse a IFLA_AF_SPEC attribute and modify * net_device accordingly. */ struct rtnl_af_ops { struct list_head list; int family; int (*fill_link_af)(struct sk_buff *skb, const struct net_device *dev, u32 ext_filter_mask); size_t (*get_link_af_size)(const struct net_device *dev, u32 ext_filter_mask); int (*validate_link_af)(const struct net_device *dev, const struct nlattr *attr, struct netlink_ext_ack *extack); int (*set_link_af)(struct net_device *dev, const struct nlattr *attr, struct netlink_ext_ack *extack); int (*fill_stats_af)(struct sk_buff *skb, const struct net_device *dev); size_t (*get_stats_af_size)(const struct net_device *dev); }; void rtnl_af_register(struct rtnl_af_ops *ops); void rtnl_af_unregister(struct rtnl_af_ops *ops); struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]); struct net_device *rtnl_create_link(struct net *net, const char *ifname, unsigned char name_assign_type, const struct rtnl_link_ops *ops, struct nlattr *tb[], struct netlink_ext_ack *extack); int rtnl_delete_link(struct net_device *dev); int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm); int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, struct netlink_ext_ack *exterr); struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid); #define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind) #endif |
157 32 1 149 2 149 150 2 149 524 338 54 52 3 3 15 15 15 15 15 170 267 401 404 371 397 397 396 400 401 401 398 397 394 408 408 268 267 17 254 254 265 15 388 16 16 333 334 334 335 334 335 333 333 333 4 386 385 388 386 388 250 357 248 363 386 362 389 387 226 385 45 360 387 386 384 713 334 332 330 333 334 332 1 1 1 936 368 366 365 367 939 921 380 380 382 928 367 367 367 366 935 931 913 914 16 317 387 418 387 386 448 450 445 18 18 18 18 373 375 201 202 23 185 176 8 321 321 174 174 174 171 173 174 126 236 11 166 166 246 206 147 73 205 203 1 203 205 335 335 333 335 334 335 335 335 334 220 333 16 16 16 16 151 152 25 61 20 18 16 2 183 125 1 1 51 1 2 1 49 1 2 1 48 1 1 1 1 1 110 1 1 1 397 348 5 1 1 51 51 51 51 51 51 1 1 1 1 3 2 1 2 1 2 1 1 1 5 1 4 46 46 1 6 21 1 1 1 1 2 329 54 330 2 376 66 372 133 6 106 123 108 108 108 108 108 33 33 33 33 46 45 187 46 4 4 15 15 15 188 187 188 188 289 292 92 230 90 381 381 34 348 82 348 34 34 348 108 347 169 1 167 3 300 300 298 300 209 90 57 300 300 13 349 345 348 344 343 346 101 347 129 130 130 264 263 3 262 17 261 264 264 264 261 1 397 398 325 325 360 361 152 153 153 152 152 154 153 152 140 140 361 363 79 79 79 91 91 91 130 131 129 84 83 84 15 15 15 15 15 15 14 14 16 16 1 15 15 15 1 14 156 156 156 156 156 2 30 45 188 4 15 149 150 1 149 149 150 150 2 150 2 2 149 149 148 1 1 1 1 1 1 1 1 1 1 1 3 3 3 1 1 1 1 1 1 1 1 1 1 1 9 9 9 9 169 169 169 169 169 169 169 169 168 167 168 169 169 169 169 169 169 169 167 16 254 241 17 254 31 148 10 65 64 190 240 241 240 240 240 190 180 241 241 240 241 240 148 148 148 148 148 148 148 148 148 147 148 148 148 148 148 148 148 148 148 148 148 148 148 148 148 148 148 148 148 148 148 148 148 148 148 148 148 31 148 156 156 156 156 156 156 156 156 156 156 156 156 156 156 156 156 156 156 156 156 156 156 19 19 2 2 2 26 26 25 1 26 26 39 39 39 39 66 13 58 28 20 47 1 35 35 35 35 34 35 339 341 39 2 341 341 1 340 341 1 1 1 98 98 92 12 12 1 4 5 2 3 172 32 58 58 30 40 4 103 51 1 1 108 108 107 103 51 1 1 1 15 15 1 1 14 1 6 7 8 6 6 6 6 6 207 205 2 1 1 10 10 10 10 375 374 210 326 375 377 375 10 10 10 11 4 12 12 11 11 388 385 15 10 1 344 208 49 380 3 379 386 388 388 64 4 63 3 3 3 3 3 83 1 82 82 55 55 97 97 173 171 384 384 173 98 1 1 302 343 341 402 357 194 39 2 70 118 1 286 285 286 388 22 4 3 7 325 340 380 63 13 385 386 388 386 386 332 351 325 378 387 380 171 377 9 386 388 385 387 388 386 388 385 383 14 13 1 385 384 381 387 11 381 14 381 14 14 14 14 148 148 148 148 148 148 147 1 148 148 148 148 31 148 148 110 110 111 111 298 299 115 75 75 75 75 75 75 75 73 75 75 75 75 75 75 75 65 65 65 65 65 75 115 115 40 75 40 75 75 115 115 115 114 109 6 112 3 22 1 1 1 20 20 20 5 17 13 477 475 10 6 5 10 10 9 9 9 1 11 11 192 190 4 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 | // SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * * This module enables machines with Intel VT-x extensions to run virtual * machines without emulation or binary translation. * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> */ #include <linux/highmem.h> #include <linux/hrtimer.h> #include <linux/kernel.h> #include <linux/kvm_host.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/mod_devicetable.h> #include <linux/mm.h> #include <linux/objtool.h> #include <linux/sched.h> #include <linux/sched/smt.h> #include <linux/slab.h> #include <linux/tboot.h> #include <linux/trace_events.h> #include <linux/entry-kvm.h> #include <asm/apic.h> #include <asm/asm.h> #include <asm/cpu.h> #include <asm/cpu_device_id.h> #include <asm/debugreg.h> #include <asm/desc.h> #include <asm/fpu/internal.h> #include <asm/idtentry.h> #include <asm/io.h> #include <asm/irq_remapping.h> #include <asm/kexec.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/mshyperv.h> #include <asm/mwait.h> #include <asm/spec-ctrl.h> #include <asm/virtext.h> #include <asm/vmx.h> #include "capabilities.h" #include "cpuid.h" #include "evmcs.h" #include "hyperv.h" #include "kvm_onhyperv.h" #include "irq.h" #include "kvm_cache_regs.h" #include "lapic.h" #include "mmu.h" #include "nested.h" #include "pmu.h" #include "sgx.h" #include "trace.h" #include "vmcs.h" #include "vmcs12.h" #include "vmx.h" #include "x86.h" MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); #ifdef MODULE static const struct x86_cpu_id vmx_cpu_id[] = { X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); #endif bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); static bool __read_mostly enable_vnmi = 1; module_param_named(vnmi, enable_vnmi, bool, S_IRUGO); bool __read_mostly flexpriority_enabled = 1; module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); bool __read_mostly enable_ept = 1; module_param_named(ept, enable_ept, bool, S_IRUGO); bool __read_mostly enable_unrestricted_guest = 1; module_param_named(unrestricted_guest, enable_unrestricted_guest, bool, S_IRUGO); bool __read_mostly enable_ept_ad_bits = 1; module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); static bool __read_mostly emulate_invalid_guest_state = true; module_param(emulate_invalid_guest_state, bool, S_IRUGO); static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); module_param(enable_apicv, bool, S_IRUGO); /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not * use VMX instructions. */ static bool __read_mostly nested = 1; module_param(nested, bool, S_IRUGO); bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); static bool __read_mostly dump_invalid_vmcs = 0; module_param(dump_invalid_vmcs, bool, 0644); #define MSR_BITMAP_MODE_X2APIC 1 #define MSR_BITMAP_MODE_X2APIC_APICV 2 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL /* Guest_tsc -> host_tsc conversion requires 64-bit division. */ static int __read_mostly cpu_preemption_timer_multi; static bool __read_mostly enable_preemption_timer = 1; #ifdef CONFIG_X86_64 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #endif extern bool __read_mostly allow_smaller_maxphyaddr; module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ RTIT_STATUS_BYTECNT)) /* * List of MSRs that can be directly passed to the guest. * In addition to these x2apic and PT MSRs are handled specially. */ static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { MSR_IA32_SPEC_CTRL, MSR_IA32_PRED_CMD, MSR_IA32_TSC, #ifdef CONFIG_X86_64 MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE, #endif MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_CORE_C1_RES, MSR_CORE_C3_RESIDENCY, MSR_CORE_C6_RESIDENCY, MSR_CORE_C7_RESIDENCY, }; /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive * executions of PAUSE in a loop. Also indicate if ple enabled. * According to test, this time is usually smaller than 128 cycles. * ple_window: upper bound on the amount of time a guest is allowed to execute * in a PAUSE loop. Tests indicate that most spinlocks are held for * less than 2^12 cycles * Time is measured based on a counter that runs at the same rate as the TSC, * refer SDM volume 3b section 21.6.13 & 22.1.3. */ static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; module_param(ple_gap, uint, 0444); static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, uint, 0444); /* Default doubles per-vcpu window every exit. */ static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; module_param(ple_window_grow, uint, 0444); /* Default resets per-vcpu window every exit to ple_window. */ static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; module_param(ple_window_shrink, uint, 0444); /* Default is to compute the maximum so we can never overflow. */ static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, uint, 0444); /* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */ int __read_mostly pt_mode = PT_MODE_SYSTEM; #ifdef CONFIG_BROKEN module_param(pt_mode, int, S_IRUGO); #endif static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); static DEFINE_MUTEX(vmx_l1d_flush_mutex); /* Storage for pre module init parameter parsing */ static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; static const struct { const char *option; bool for_parse; } vmentry_l1d_param[] = { [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, [VMENTER_L1D_FLUSH_COND] = {"cond", true}, [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, }; #define L1D_CACHE_ORDER 4 static void *vmx_l1d_flush_pages; /* Control for disabling CPU Fill buffer clear */ static bool __read_mostly vmx_fb_clear_ctrl_available; static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) { struct page *page; unsigned int i; if (!boot_cpu_has_bug(X86_BUG_L1TF)) { l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; return 0; } if (!enable_ept) { l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; return 0; } if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { u64 msr; rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; return 0; } } /* If set to auto use the default l1tf mitigation method */ if (l1tf == VMENTER_L1D_FLUSH_AUTO) { switch (l1tf_mitigation) { case L1TF_MITIGATION_OFF: l1tf = VMENTER_L1D_FLUSH_NEVER; break; case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT: l1tf = VMENTER_L1D_FLUSH_COND; break; case L1TF_MITIGATION_FULL: case L1TF_MITIGATION_FULL_FORCE: l1tf = VMENTER_L1D_FLUSH_ALWAYS; break; } } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { l1tf = VMENTER_L1D_FLUSH_ALWAYS; } if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { /* * This allocation for vmx_l1d_flush_pages is not tied to a VM * lifetime and so should not be charged to a memcg. */ page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); if (!page) return -ENOMEM; vmx_l1d_flush_pages = page_address(page); /* * Initialize each page with a different pattern in * order to protect against KSM in the nested * virtualization case. */ for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, PAGE_SIZE); } } l1tf_vmx_mitigation = l1tf; if (l1tf != VMENTER_L1D_FLUSH_NEVER) static_branch_enable(&vmx_l1d_should_flush); else static_branch_disable(&vmx_l1d_should_flush); if (l1tf == VMENTER_L1D_FLUSH_COND) static_branch_enable(&vmx_l1d_flush_cond); else static_branch_disable(&vmx_l1d_flush_cond); return 0; } static int vmentry_l1d_flush_parse(const char *s) { unsigned int i; if (s) { for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { if (vmentry_l1d_param[i].for_parse && sysfs_streq(s, vmentry_l1d_param[i].option)) return i; } } return -EINVAL; } static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) { int l1tf, ret; l1tf = vmentry_l1d_flush_parse(s); if (l1tf < 0) return l1tf; if (!boot_cpu_has(X86_BUG_L1TF)) return 0; /* * Has vmx_init() run already? If not then this is the pre init * parameter parsing. In that case just store the value and let * vmx_init() do the proper setup after enable_ept has been * established. */ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { vmentry_l1d_flush_param = l1tf; return 0; } mutex_lock(&vmx_l1d_flush_mutex); ret = vmx_setup_l1d_flush(l1tf); mutex_unlock(&vmx_l1d_flush_mutex); return ret; } static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) { if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) return sprintf(s, "???\n"); return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); } static void vmx_setup_fb_clear_ctrl(void) { u64 msr; if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) && !boot_cpu_has_bug(X86_BUG_MDS) && !boot_cpu_has_bug(X86_BUG_TAA)) { rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); if (msr & ARCH_CAP_FB_CLEAR_CTRL) vmx_fb_clear_ctrl_available = true; } } static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) { u64 msr; if (!vmx->disable_fb_clear) return; msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); msr |= FB_CLEAR_DIS; native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); /* Cache the MSR value to avoid reading it later */ vmx->msr_ia32_mcu_opt_ctrl = msr; } static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) { if (!vmx->disable_fb_clear) return; vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); } static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && vmx_fb_clear_ctrl_available; /* * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS * at VMEntry. Skip the MSR read/write when a guest has no use case to * execute VERW. */ if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) || ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) && (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) && (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) && (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) && (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO))) vmx->disable_fb_clear = false; } static const struct kernel_param_ops vmentry_l1d_flush_ops = { .set = vmentry_l1d_flush_set, .get = vmentry_l1d_flush_get, }; module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); static u32 vmx_segment_access_rights(struct kvm_segment *var); void vmx_vmexit(void); #define vmx_insn_failed(fmt...) \ do { \ WARN_ONCE(1, fmt); \ pr_warn_ratelimited(fmt); \ } while (0) asmlinkage void vmread_error(unsigned long field, bool fault) { if (fault) kvm_spurious_fault(); else vmx_insn_failed("kvm: vmread failed: field=%lx\n", field); } noinline void vmwrite_error(unsigned long field, unsigned long value) { vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n", field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) { vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr); } noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) { vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr); } noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) { vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", ext, vpid, gva); } noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa) { vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", ext, eptp, gpa); } static DEFINE_PER_CPU(struct vmcs *, vmxarea); DEFINE_PER_CPU(struct vmcs *, current_vmcs); /* * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. */ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); struct vmcs_config vmcs_config; struct vmx_capability vmx_capability; #define VMX_SEGMENT_FIELD(seg) \ [VCPU_SREG_##seg] = { \ .selector = GUEST_##seg##_SELECTOR, \ .base = GUEST_##seg##_BASE, \ .limit = GUEST_##seg##_LIMIT, \ .ar_bytes = GUEST_##seg##_AR_BYTES, \ } static const struct kvm_vmx_segment_field { unsigned selector; unsigned base; unsigned limit; unsigned ar_bytes; } kvm_vmx_segment_fields[] = { VMX_SEGMENT_FIELD(CS), VMX_SEGMENT_FIELD(DS), VMX_SEGMENT_FIELD(ES), VMX_SEGMENT_FIELD(FS), VMX_SEGMENT_FIELD(GS), VMX_SEGMENT_FIELD(SS), VMX_SEGMENT_FIELD(TR), VMX_SEGMENT_FIELD(LDTR), }; static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) { vmx->segment_cache.bitmask = 0; } static unsigned long host_idt_base; #if IS_ENABLED(CONFIG_HYPERV) static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) { struct hv_enlightened_vmcs *evmcs; struct hv_partition_assist_pg **p_hv_pa_pg = &to_kvm_hv(vcpu->kvm)->hv_pa_pg; /* * Synthetic VM-Exit is not enabled in current code and so All * evmcs in singe VM shares same assist page. */ if (!*p_hv_pa_pg) *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT); if (!*p_hv_pa_pg) return -ENOMEM; evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs; evmcs->partition_assist_page = __pa(*p_hv_pa_pg); evmcs->hv_vm_id = (unsigned long)vcpu->kvm; evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; return 0; } #endif /* IS_ENABLED(CONFIG_HYPERV) */ /* * Comment's format: document - errata name - stepping - processor name. * Refer from * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp */ static u32 vmx_preemption_cpu_tfms[] = { /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ 0x000206E6, /* 323056.pdf - AAX65 - C2 - Xeon L3406 */ /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ 0x00020652, /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ 0x00020655, /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ /* * 320767.pdf - AAP86 - B1 - * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile */ 0x000106E5, /* 321333.pdf - AAM126 - C0 - Xeon 3500 */ 0x000106A0, /* 321333.pdf - AAM126 - C1 - Xeon 3500 */ 0x000106A1, /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ 0x000106A4, /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ 0x000106A5, /* Xeon E3-1220 V2 */ 0x000306A8, }; static inline bool cpu_has_broken_vmx_preemption_timer(void) { u32 eax = cpuid_eax(0x00000001), i; /* Clear the reserved bits */ eax &= ~(0x3U << 14 | 0xfU << 28); for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) if (eax == vmx_preemption_cpu_tfms[i]) return true; return false; } static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) { return flexpriority_enabled && lapic_in_kernel(vcpu); } static inline bool report_flexpriority(void) { return flexpriority_enabled; } static int possible_passthrough_msr_slot(u32 msr) { u32 i; for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) if (vmx_possible_passthrough_msrs[i] == msr) return i; return -ENOENT; } static bool is_valid_passthrough_msr(u32 msr) { bool r; switch (msr) { case 0x800 ... 0x8ff: /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */ return true; case MSR_IA32_RTIT_STATUS: case MSR_IA32_RTIT_OUTPUT_BASE: case MSR_IA32_RTIT_OUTPUT_MASK: case MSR_IA32_RTIT_CR3_MATCH: case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: /* PT MSRs. These are handled in pt_update_intercept_for_msr() */ case MSR_LBR_SELECT: case MSR_LBR_TOS: case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31: case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31: case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31: case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8: case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8: /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */ return true; } r = possible_passthrough_msr_slot(msr) != -ENOENT; WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); return r; } struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { int i; i = kvm_find_user_return_msr(msr); if (i >= 0) return &vmx->guest_uret_msrs[i]; return NULL; } static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, struct vmx_uret_msr *msr, u64 data) { unsigned int slot = msr - vmx->guest_uret_msrs; int ret = 0; u64 old_msr_data = msr->data; msr->data = data; if (msr->load_into_hardware) { preempt_disable(); ret = kvm_set_user_return_msr(slot, msr->data, msr->mask); preempt_enable(); if (ret) msr->data = old_msr_data; } return ret; } #ifdef CONFIG_KEXEC_CORE static void crash_vmclear_local_loaded_vmcss(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), loaded_vmcss_on_cpu_link) vmcs_clear(v->vmcs); } #endif /* CONFIG_KEXEC_CORE */ static void __loaded_vmcs_clear(void *arg) { struct loaded_vmcs *loaded_vmcs = arg; int cpu = raw_smp_processor_id(); if (loaded_vmcs->cpu != cpu) return; /* vcpu migration can race with cpu offline */ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) per_cpu(current_vmcs, cpu) = NULL; vmcs_clear(loaded_vmcs->vmcs); if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) vmcs_clear(loaded_vmcs->shadow_vmcs); list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); /* * Ensure all writes to loaded_vmcs, including deleting it from its * current percpu list, complete before setting loaded_vmcs->vcpu to * -1, otherwise a different cpu can see vcpu == -1 first and add * loaded_vmcs to its percpu list before it's deleted from this cpu's * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). */ smp_wmb(); loaded_vmcs->cpu = -1; loaded_vmcs->launched = 0; } void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) { int cpu = loaded_vmcs->cpu; if (cpu != -1) smp_call_function_single(cpu, __loaded_vmcs_clear, loaded_vmcs, 1); } static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, unsigned field) { bool ret; u32 mask = 1 << (seg * SEG_FIELD_NR + field); if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); vmx->segment_cache.bitmask = 0; } ret = vmx->segment_cache.bitmask & mask; vmx->segment_cache.bitmask |= mask; return ret; } static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) { u16 *p = &vmx->segment_cache.seg[seg].selector; if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); return *p; } static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) { ulong *p = &vmx->segment_cache.seg[seg].base; if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); return *p; } static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) { u32 *p = &vmx->segment_cache.seg[seg].limit; if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); return *p; } static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) { u32 *p = &vmx->segment_cache.seg[seg].ar; if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); return *p; } void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); /* * Guest access to VMware backdoor ports could legitimately * trigger #GP because of TSS I/O permission bitmap. * We intercept those #GP and allow access to them anyway * as VMware does. */ if (enable_vmware_backdoor) eb |= (1u << GP_VECTOR); if ((vcpu->guest_debug & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) eb |= 1u << BP_VECTOR; if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (!vmx_need_pf_intercept(vcpu)) eb &= ~(1u << PF_VECTOR); /* When we are running a nested L2 guest and L1 specified for it a * certain exception bitmap, we must trap the same exceptions and pass * them to L1. When running L2, we will only handle the exceptions * specified above if L1 did not want them. */ if (is_guest_mode(vcpu)) eb |= get_vmcs12(vcpu)->exception_bitmap; else { int mask = 0, match = 0; if (enable_ept && (eb & (1u << PF_VECTOR))) { /* * If EPT is enabled, #PF is currently only intercepted * if MAXPHYADDR is smaller on the guest than on the * host. In that case we only care about present, * non-reserved faults. For vmcs02, however, PFEC_MASK * and PFEC_MATCH are set in prepare_vmcs02_rare. */ mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK; match = PFERR_PRESENT_MASK; } vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); } vmcs_write32(EXCEPTION_BITMAP, eb); } /* * Check if MSR is intercepted for currently loaded MSR bitmap. */ static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) { if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) return true; return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr); } unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) { unsigned int flags = 0; if (vmx->loaded_vmcs->launched) flags |= VMX_RUN_VMRESUME; /* * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free * to change it directly without causing a vmexit. In that case read * it after vmexit and store it in vmx->spec_ctrl. */ if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) flags |= VMX_RUN_SAVE_SPEC_CTRL; return flags; } static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, unsigned long entry, unsigned long exit) { vm_entry_controls_clearbit(vmx, entry); vm_exit_controls_clearbit(vmx, exit); } int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) { unsigned int i; for (i = 0; i < m->nr; ++i) { if (m->val[i].index == msr) return i; } return -ENOENT; } static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) { int i; struct msr_autoload *m = &vmx->msr_autoload; switch (msr) { case MSR_EFER: if (cpu_has_load_ia32_efer()) { clear_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER); return; } break; case MSR_CORE_PERF_GLOBAL_CTRL: if (cpu_has_load_perf_global_ctrl()) { clear_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); return; } break; } i = vmx_find_loadstore_msr_slot(&m->guest, msr); if (i < 0) goto skip_guest; --m->guest.nr; m->guest.val[i] = m->guest.val[m->guest.nr]; vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); skip_guest: i = vmx_find_loadstore_msr_slot(&m->host, msr); if (i < 0) return; --m->host.nr; m->host.val[i] = m->host.val[m->host.nr]; vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); } static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, unsigned long entry, unsigned long exit, unsigned long guest_val_vmcs, unsigned long host_val_vmcs, u64 guest_val, u64 host_val) { vmcs_write64(guest_val_vmcs, guest_val); if (host_val_vmcs != HOST_IA32_EFER) vmcs_write64(host_val_vmcs, host_val); vm_entry_controls_setbit(vmx, entry); vm_exit_controls_setbit(vmx, exit); } static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, u64 guest_val, u64 host_val, bool entry_only) { int i, j = 0; struct msr_autoload *m = &vmx->msr_autoload; switch (msr) { case MSR_EFER: if (cpu_has_load_ia32_efer()) { add_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER, GUEST_IA32_EFER, HOST_IA32_EFER, guest_val, host_val); return; } break; case MSR_CORE_PERF_GLOBAL_CTRL: if (cpu_has_load_perf_global_ctrl()) { add_atomic_switch_msr_special(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, GUEST_IA32_PERF_GLOBAL_CTRL, HOST_IA32_PERF_GLOBAL_CTRL, guest_val, host_val); return; } break; case MSR_IA32_PEBS_ENABLE: /* PEBS needs a quiescent period after being disabled (to write * a record). Disabling PEBS through VMX MSR swapping doesn't * provide that period, so a CPU could write host's record into * guest's memory. */ wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } i = vmx_find_loadstore_msr_slot(&m->guest, msr); if (!entry_only) j = vmx_find_loadstore_msr_slot(&m->host, msr); if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; } if (i < 0) { i = m->guest.nr++; vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); } m->guest.val[i].index = msr; m->guest.val[i].value = guest_val; if (entry_only) return; if (j < 0) { j = m->host.nr++; vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); } m->host.val[j].index = msr; m->host.val[j].value = host_val; } static bool update_transition_efer(struct vcpu_vmx *vmx) { u64 guest_efer = vmx->vcpu.arch.efer; u64 ignore_bits = 0; int i; /* Shadow paging assumes NX to be available. */ if (!enable_ept) guest_efer |= EFER_NX; /* * LMA and LME handled by hardware; SCE meaningless outside long mode. */ ignore_bits |= EFER_SCE; #ifdef CONFIG_X86_64 ignore_bits |= EFER_LMA | EFER_LME; /* SCE is meaningful only in long mode on Intel */ if (guest_efer & EFER_LMA) ignore_bits &= ~(u64)EFER_SCE; #endif /* * On EPT, we can't emulate NX, so we must switch EFER atomically. * On CPUs that support "load IA32_EFER", always switch EFER * atomically, since it's faster than switching it manually. */ if (cpu_has_load_ia32_efer() || (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { if (!(guest_efer & EFER_LMA)) guest_efer &= ~EFER_LME; if (guest_efer != host_efer) add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer, false); else clear_atomic_switch_msr(vmx, MSR_EFER); return false; } i = kvm_find_user_return_msr(MSR_EFER); if (i < 0) return false; clear_atomic_switch_msr(vmx, MSR_EFER); guest_efer &= ~ignore_bits; guest_efer |= host_efer & ignore_bits; vmx->guest_uret_msrs[i].data = guest_efer; vmx->guest_uret_msrs[i].mask = ~ignore_bits; return true; } #ifdef CONFIG_X86_32 /* * On 32-bit kernels, VM exits still load the FS and GS bases from the * VMCS rather than the segment table. KVM uses this helper to figure * out the current bases to poke them into the VMCS before entry. */ static unsigned long segment_base(u16 selector) { struct desc_struct *table; unsigned long v; if (!(selector & ~SEGMENT_RPL_MASK)) return 0; table = get_current_gdt_ro(); if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { u16 ldt_selector = kvm_read_ldt(); if (!(ldt_selector & ~SEGMENT_RPL_MASK)) return 0; table = (struct desc_struct *)segment_base(ldt_selector); } v = get_desc_base(&table[selector >> 3]); return v; } #endif static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) { return vmx_pt_mode_is_host_guest() && !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); } static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) { /* The base must be 128-byte aligned and a legal physical address. */ return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128); } static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); for (i = 0; i < addr_range; i++) { wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); } } static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); for (i = 0; i < addr_range; i++) { rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); } } static void pt_guest_enter(struct vcpu_vmx *vmx) { if (vmx_pt_mode_is_system()) return; /* * GUEST_IA32_RTIT_CTL is already set in the VMCS. * Save host state before VM entry. */ rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { wrmsrl(MSR_IA32_RTIT_CTL, 0); pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range); pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range); } } static void pt_guest_exit(struct vcpu_vmx *vmx) { if (vmx_pt_mode_is_system()) return; if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range); pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range); } /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */ wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); } void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, unsigned long fs_base, unsigned long gs_base) { if (unlikely(fs_sel != host->fs_sel)) { if (!(fs_sel & 7)) vmcs_write16(HOST_FS_SELECTOR, fs_sel); else vmcs_write16(HOST_FS_SELECTOR, 0); host->fs_sel = fs_sel; } if (unlikely(gs_sel != host->gs_sel)) { if (!(gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, gs_sel); else vmcs_write16(HOST_GS_SELECTOR, 0); host->gs_sel = gs_sel; } if (unlikely(fs_base != host->fs_base)) { vmcs_writel(HOST_FS_BASE, fs_base); host->fs_base = fs_base; } if (unlikely(gs_base != host->gs_base)) { vmcs_writel(HOST_GS_BASE, gs_base); host->gs_base = gs_base; } } void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs_host_state *host_state; #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); #endif unsigned long fs_base, gs_base; u16 fs_sel, gs_sel; int i; vmx->req_immediate_exit = false; /* * Note that guest MSRs to be saved/restored can also be changed * when guest state is loaded. This happens when guest transitions * to/from long-mode by setting MSR_EFER.LMA. */ if (!vmx->guest_uret_msrs_loaded) { vmx->guest_uret_msrs_loaded = true; for (i = 0; i < kvm_nr_uret_msrs; ++i) { if (!vmx->guest_uret_msrs[i].load_into_hardware) continue; kvm_set_user_return_msr(i, vmx->guest_uret_msrs[i].data, vmx->guest_uret_msrs[i].mask); } } if (vmx->nested.need_vmcs12_to_shadow_sync) nested_sync_vmcs12_to_shadow(vcpu); if (vmx->guest_state_loaded) return; host_state = &vmx->loaded_vmcs->host_state; /* * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. */ host_state->ldt_sel = kvm_read_ldt(); #ifdef CONFIG_X86_64 savesegment(ds, host_state->ds_sel); savesegment(es, host_state->es_sel); gs_base = cpu_kernelmode_gs_base(cpu); if (likely(is_64bit_mm(current->mm))) { current_save_fsgs(); fs_sel = current->thread.fsindex; gs_sel = current->thread.gsindex; fs_base = current->thread.fsbase; vmx->msr_host_kernel_gs_base = current->thread.gsbase; } else { savesegment(fs, fs_sel); savesegment(gs, gs_sel); fs_base = read_msr(MSR_FS_BASE); vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #else savesegment(fs, fs_sel); savesegment(gs, gs_sel); fs_base = segment_base(fs_sel); gs_base = segment_base(gs_sel); #endif vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); vmx->guest_state_loaded = true; } static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { struct vmcs_host_state *host_state; if (!vmx->guest_state_loaded) return; host_state = &vmx->loaded_vmcs->host_state; ++vmx->vcpu.stat.host_state_reload; #ifdef CONFIG_X86_64 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif if (host_state->ldt_sel || (host_state->gs_sel & 7)) { kvm_load_ldt(host_state->ldt_sel); #ifdef CONFIG_X86_64 load_gs_index(host_state->gs_sel); #else loadsegment(gs, host_state->gs_sel); #endif } if (host_state->fs_sel & 7) loadsegment(fs, host_state->fs_sel); #ifdef CONFIG_X86_64 if (unlikely(host_state->ds_sel | host_state->es_sel)) { loadsegment(ds, host_state->ds_sel); loadsegment(es, host_state->es_sel); } #endif invalidate_tss_limit(); #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif load_fixmap_gdt(raw_smp_processor_id()); vmx->guest_state_loaded = false; vmx->guest_uret_msrs_loaded = false; } #ifdef CONFIG_X86_64 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { preempt_disable(); if (vmx->guest_state_loaded) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); preempt_enable(); return vmx->msr_guest_kernel_gs_base; } static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) { preempt_disable(); if (vmx->guest_state_loaded) wrmsrl(MSR_KERNEL_GS_BASE, data); preempt_enable(); vmx->msr_guest_kernel_gs_base = data; } #endif void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, struct loaded_vmcs *buddy) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; struct vmcs *prev; if (!already_loaded) { loaded_vmcs_clear(vmx->loaded_vmcs); local_irq_disable(); /* * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to * this cpu's percpu list, otherwise it may not yet be deleted * from its previous cpu's percpu list. Pairs with the * smb_wmb() in __loaded_vmcs_clear(). */ smp_rmb(); list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, &per_cpu(loaded_vmcss_on_cpu, cpu)); local_irq_enable(); } prev = per_cpu(current_vmcs, cpu); if (prev != vmx->loaded_vmcs->vmcs) { per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; vmcs_load(vmx->loaded_vmcs->vmcs); /* * No indirect branch prediction barrier needed when switching * the active VMCS within a vCPU, unless IBRS is advertised to * the vCPU. To minimize the number of IBPBs executed, KVM * performs IBPB on nested VM-Exit (a single nested transition * may switch the active VMCS multiple times). */ if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) indirect_branch_prediction_barrier(); } if (!already_loaded) { void *gdt = get_current_gdt_ro(); unsigned long sysenter_esp; /* * Flush all EPTP/VPID contexts, the new pCPU may have stale * TLB entries from its previous association with the vCPU. */ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); /* * Linux uses per-cpu TSS and GDT, so set these when switching * processors. See 22.2.4. */ vmcs_writel(HOST_TR_BASE, (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ vmx->loaded_vmcs->cpu = cpu; } } /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. */ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); vmx_vcpu_load_vmcs(vcpu, cpu, NULL); vmx_vcpu_pi_load(vcpu, cpu); vmx->host_debugctlmsr = get_debugctlmsr(); } static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); vmx_prepare_switch_to_host(to_vmx(vcpu)); } bool vmx_emulation_required(struct kvm_vcpu *vcpu) { return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); } unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long rflags, save_rflags; if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) { kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); rflags = vmcs_readl(GUEST_RFLAGS); if (vmx->rmode.vm86_active) { rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; save_rflags = vmx->rmode.save_rflags; rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; } vmx->rflags = rflags; } return vmx->rflags; } void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long old_rflags; /* * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU * is an unrestricted guest in order to mark L2 as needing emulation * if L1 runs L2 as a restricted guest. */ if (is_unrestricted_guest(vcpu)) { kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); vmx->rflags = rflags; vmcs_writel(GUEST_RFLAGS, rflags); return; } old_rflags = vmx_get_rflags(vcpu); vmx->rflags = rflags; if (vmx->rmode.vm86_active) { vmx->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; } vmcs_writel(GUEST_RFLAGS, rflags); if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) vmx->emulation_required = vmx_emulation_required(vcpu); } static bool vmx_get_if_flag(struct kvm_vcpu *vcpu) { return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; } u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); int ret = 0; if (interruptibility & GUEST_INTR_STATE_STI) ret |= KVM_X86_SHADOW_INT_STI; if (interruptibility & GUEST_INTR_STATE_MOV_SS) ret |= KVM_X86_SHADOW_INT_MOV_SS; return ret; } void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) { u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); u32 interruptibility = interruptibility_old; interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); if (mask & KVM_X86_SHADOW_INT_MOV_SS) interruptibility |= GUEST_INTR_STATE_MOV_SS; else if (mask & KVM_X86_SHADOW_INT_STI) interruptibility |= GUEST_INTR_STATE_STI; if ((interruptibility != interruptibility_old)) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); } static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long value; /* * Any MSR write that attempts to change bits marked reserved will * case a #GP fault. */ if (data & vmx->pt_desc.ctl_bitmask) return 1; /* * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will * result in a #GP unless the same write also clears TraceEn. */ if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN)) return 1; /* * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit * and FabricEn would cause #GP, if * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 */ if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && !(data & RTIT_CTL_FABRIC_EN) && !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_single_range_output)) return 1; /* * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that * utilize encodings marked reserved will cause a #GP fault. */ value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && !test_bit((data & RTIT_CTL_MTC_RANGE) >> RTIT_CTL_MTC_RANGE_OFFSET, &value)) return 1; value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cycle_thresholds); if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && !test_bit((data & RTIT_CTL_CYC_THRESH) >> RTIT_CTL_CYC_THRESH_OFFSET, &value)) return 1; value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods); if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && !test_bit((data & RTIT_CTL_PSB_FREQ) >> RTIT_CTL_PSB_FREQ_OFFSET, &value)) return 1; /* * If ADDRx_CFG is reserved or the encodings is >2 will * cause a #GP fault. */ value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2)) return 1; value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2)) return 1; value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2)) return 1; value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2)) return 1; return 0; } static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) { /* * Emulation of instructions in SGX enclaves is impossible as RIP does * not point tthe failing instruction, and even if it did, the code * stream is inaccessible. Inject #UD instead of exiting to userspace * so that guest userspace can't DoS the guest simply by triggering * emulation (enclaves are CPL3 only). */ if (to_vmx(vcpu)->exit_reason.enclave_mode) { kvm_queue_exception(vcpu, UD_VECTOR); return false; } return true; } static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason; unsigned long rip, orig_rip; u32 instr_len; /* * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on * undefined behavior: Intel's SDM doesn't mandate the VMCS field be * set when EPT misconfig occurs. In practice, real hardware updates * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors * (namely Hyper-V) don't set it due to it being undefined behavior, * i.e. we end up advancing IP with some random value. */ if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); /* * Emulating an enclave's instructions isn't supported as KVM * cannot access the enclave's memory or its true RIP, e.g. the * vmcs.GUEST_RIP points at the exit point of the enclave, not * the RIP that actually triggered the VM-Exit. But, because * most instructions that cause VM-Exit will #UD in an enclave, * most instruction-based VM-Exits simply do not occur. * * There are a few exceptions, notably the debug instructions * INT1ICEBRK and INT3, as they are allowed in debug enclaves * and generate #DB/#BP as expected, which KVM might intercept. * But again, the CPU does the dirty work and saves an instr * length of zero so VMMs don't shoot themselves in the foot. * WARN if KVM tries to skip a non-zero length instruction on * a VM-Exit from an enclave. */ if (!instr_len) goto rip_updated; WARN(exit_reason.enclave_mode, "KVM: skipping instruction after SGX enclave VM-Exit"); orig_rip = kvm_rip_read(vcpu); rip = orig_rip + instr_len; #ifdef CONFIG_X86_64 /* * We need to mask out the high 32 bits of RIP if not in 64-bit * mode, but just finding out that we are in 64-bit mode is * quite expensive. Only do it if there was a carry. */ if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu)) rip = (u32)rip; #endif kvm_rip_write(vcpu, rip); } else { if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) return 0; } rip_updated: /* skipping an emulated instruction also counts */ vmx_set_interrupt_shadow(vcpu, 0); return 1; } /* * Recognizes a pending MTF VM-exit and records the nested state for later * delivery. */ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); if (!is_guest_mode(vcpu)) return; /* * Per the SDM, MTF takes priority over debug-trap exceptions besides * T-bit traps. As instruction emulation is completed (i.e. at the * instruction boundary), any #DB exception pending delivery must be a * debug-trap. Record the pending MTF state to be delivered in * vmx_check_nested_events(). */ if (nested_cpu_has_mtf(vmcs12) && (!vcpu->arch.exception.pending || vcpu->arch.exception.nr == DB_VECTOR)) vmx->nested.mtf_pending = true; else vmx->nested.mtf_pending = false; } static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) { vmx_update_emulated_instruction(vcpu); return skip_emulated_instruction(vcpu); } static void vmx_clear_hlt(struct kvm_vcpu *vcpu) { /* * Ensure that we clear the HLT state in the VMCS. We don't need to * explicitly skip the instruction because if the HLT state is set, * then the instruction is already executing and RIP has already been * advanced. */ if (kvm_hlt_in_guest(vcpu->kvm) && vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); } static void vmx_queue_exception(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned nr = vcpu->arch.exception.nr; bool has_error_code = vcpu->arch.exception.has_error_code; u32 error_code = vcpu->arch.exception.error_code; u32 intr_info = nr | INTR_INFO_VALID_MASK; kvm_deliver_exception_payload(vcpu); if (has_error_code) { /* * Despite the error code being architecturally defined as 32 * bits, and the VMCS field being 32 bits, Intel CPUs and thus * VMX don't actually supporting setting bits 31:16. Hardware * will (should) never provide a bogus error code, but AMD CPUs * do generate error codes with bits 31:16 set, and so KVM's * ABI lets userspace shove in arbitrary 32-bit values. Drop * the upper bits to avoid VM-Fail, losing information that * does't really exist is preferable to killing the VM. */ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)error_code); intr_info |= INTR_INFO_DELIVER_CODE_MASK; } if (vmx->rmode.vm86_active) { int inc_eip = 0; if (kvm_exception_is_soft(nr)) inc_eip = vcpu->arch.event_exit_inst_len; kvm_inject_realmode_interrupt(vcpu, nr, inc_eip); return; } WARN_ON_ONCE(vmx->emulation_required); if (kvm_exception_is_soft(nr)) { vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmx->vcpu.arch.event_exit_inst_len); intr_info |= INTR_TYPE_SOFT_EXCEPTION; } else intr_info |= INTR_TYPE_HARD_EXCEPTION; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); vmx_clear_hlt(vcpu); } static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, bool load_into_hardware) { struct vmx_uret_msr *uret_msr; uret_msr = vmx_find_uret_msr(vmx, msr); if (!uret_msr) return; uret_msr->load_into_hardware = load_into_hardware; } /* * Configuring user return MSRs to automatically save, load, and restore MSRs * that need to be shoved into hardware when running the guest. Note, omitting * an MSR here does _NOT_ mean it's not emulated, only that it will not be * loaded into hardware when running the guest. */ static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) { #ifdef CONFIG_X86_64 bool load_syscall_msrs; /* * The SYSCALL MSRs are only needed on long mode guests, and only * when EFER.SCE is set. */ load_syscall_msrs = is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE); vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs); vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs); vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs); #endif vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx)); vmx_setup_uret_msr(vmx, MSR_TSC_AUX, guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID)); /* * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new * kernel and old userspace. If those guests run on a tsx=off host, do * allow guests to use TSX_CTRL, but don't change the value in hardware * so that TSX remains always disabled. */ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); /* * The set of MSRs to load may have changed, reload MSRs before the * next VM-Enter. */ vmx->guest_uret_msrs_loaded = false; } u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) return vmcs12->tsc_offset; return 0; } u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) && nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) return vmcs12->tsc_multiplier; return kvm_default_tsc_scaling_ratio; } static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { vmcs_write64(TSC_OFFSET, offset); } static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) { vmcs_write64(TSC_MULTIPLIER, multiplier); } /* * nested_vmx_allowed() checks whether a guest should be allowed to use VMX * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for * all guests if the "nested" module option is off, and can also be disabled * for a single guest by disabling its VMX cpuid bit. */ bool nested_vmx_allowed(struct kvm_vcpu *vcpu) { return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); } static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, uint64_t val) { uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; return !(val & ~valid_bits); } static int vmx_get_msr_feature(struct kvm_msr_entry *msr) { switch (msr->index) { case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested) return 1; return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); case MSR_IA32_PERF_CAPABILITIES: msr->data = vmx_get_perf_capabilities(); return 0; default: return KVM_MSR_RET_INVALID; } } /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr; u32 index; switch (msr_info->index) { #ifdef CONFIG_X86_64 case MSR_FS_BASE: msr_info->data = vmcs_readl(GUEST_FS_BASE); break; case MSR_GS_BASE: msr_info->data = vmcs_readl(GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE: msr_info->data = vmx_read_guest_kernel_gs_base(vmx); break; #endif case MSR_EFER: return kvm_get_msr_common(vcpu, msr_info); case MSR_IA32_TSX_CTRL: if (!msr_info->host_initiated && !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) return 1; goto find_uret_msr; case MSR_IA32_UMWAIT_CONTROL: if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) return 1; msr_info->data = vmx->msr_ia32_umwait_control; break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && !guest_has_spec_ctrl_msr(vcpu)) return 1; msr_info->data = to_vmx(vcpu)->spec_ctrl; break; case MSR_IA32_SYSENTER_CS: msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); break; case MSR_IA32_SYSENTER_EIP: msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); break; case MSR_IA32_SYSENTER_ESP: msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; case MSR_IA32_MCG_EXT_CTL: if (!msr_info->host_initiated && !(vmx->msr_ia32_feature_control & FEAT_CTL_LMCE_ENABLED)) return 1; msr_info->data = vcpu->arch.mcg_ext_ctl; break; case MSR_IA32_FEAT_CTL: msr_info->data = vmx->msr_ia32_feature_control; break; case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) return 1; msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) return 1; if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data)) return 1; /* * Enlightened VMCS v1 doesn't have certain VMCS fields but * instead of just ignoring the features, different Hyper-V * versions are either trying to use them and fail or do some * sanity checking and refuse to boot. Filter all unsupported * features out. */ if (!msr_info->host_initiated && vmx->nested.enlightened_vmcs_enabled) nested_evmcs_filter_control_msr(msr_info->index, &msr_info->data); break; case MSR_IA32_RTIT_CTL: if (!vmx_pt_mode_is_host_guest()) return 1; msr_info->data = vmx->pt_desc.guest.ctl; break; case MSR_IA32_RTIT_STATUS: if (!vmx_pt_mode_is_host_guest()) return 1; msr_info->data = vmx->pt_desc.guest.status; break; case MSR_IA32_RTIT_CR3_MATCH: if (!vmx_pt_mode_is_host_guest() || !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) return 1; msr_info->data = vmx->pt_desc.guest.cr3_match; break; case MSR_IA32_RTIT_OUTPUT_BASE: if (!vmx_pt_mode_is_host_guest() || (!intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output) && !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_single_range_output))) return 1; msr_info->data = vmx->pt_desc.guest.output_base; break; case MSR_IA32_RTIT_OUTPUT_MASK: if (!vmx_pt_mode_is_host_guest() || (!intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output) && !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_single_range_output))) return 1; msr_info->data = vmx->pt_desc.guest.output_mask; break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; if (!vmx_pt_mode_is_host_guest() || (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_num_address_ranges))) return 1; if (index % 2) msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; else msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; break; case MSR_IA32_DEBUGCTLMSR: msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); break; default: find_uret_msr: msr = vmx_find_uret_msr(vmx, msr_info->index); if (msr) { msr_info->data = msr->data; break; } return kvm_get_msr_common(vcpu, msr_info); } return 0; } static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, u64 data) { #ifdef CONFIG_X86_64 if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) return (u32)data; #endif return (unsigned long)data; } static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu) { u64 debugctl = vmx_supported_debugctl(); if (!intel_pmu_lbr_is_enabled(vcpu)) debugctl &= ~DEBUGCTLMSR_LBR_MASK; if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; return debugctl; } /* * Writes msr value into the appropriate "register". * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr; int ret = 0; u32 msr_index = msr_info->index; u64 data = msr_info->data; u32 index; switch (msr_index) { case MSR_EFER: ret = kvm_set_msr_common(vcpu, msr_info); break; #ifdef CONFIG_X86_64 case MSR_FS_BASE: vmx_segment_cache_clear(vmx); vmcs_writel(GUEST_FS_BASE, data); break; case MSR_GS_BASE: vmx_segment_cache_clear(vmx); vmcs_writel(GUEST_GS_BASE, data); break; case MSR_KERNEL_GS_BASE: vmx_write_guest_kernel_gs_base(vmx, data); break; #endif case MSR_IA32_SYSENTER_CS: if (is_guest_mode(vcpu)) get_vmcs12(vcpu)->guest_sysenter_cs = data; vmcs_write32(GUEST_SYSENTER_CS, data); break; case MSR_IA32_SYSENTER_EIP: if (is_guest_mode(vcpu)) { data = nested_vmx_truncate_sysenter_addr(vcpu, data); get_vmcs12(vcpu)->guest_sysenter_eip = data; } vmcs_writel(GUEST_SYSENTER_EIP, data); break; case MSR_IA32_SYSENTER_ESP: if (is_guest_mode(vcpu)) { data = nested_vmx_truncate_sysenter_addr(vcpu, data); get_vmcs12(vcpu)->guest_sysenter_esp = data; } vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_DEBUGCTLMSR: { u64 invalid = data & ~vcpu_supported_debugctl(vcpu); if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { if (report_ignored_msrs) vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n", __func__, data); data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); } if (invalid) return 1; if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) get_vmcs12(vcpu)->guest_ia32_debugctl = data; vmcs_write64(GUEST_IA32_DEBUGCTL, data); if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && (data & DEBUGCTLMSR_LBR)) intel_pmu_create_guest_lbr_event(vcpu); return 0; } case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) return 1; if (is_noncanonical_address(data & PAGE_MASK, vcpu) || (data & MSR_IA32_BNDCFGS_RSVD)) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; case MSR_IA32_UMWAIT_CONTROL: if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) return 1; /* The reserved bit 1 and non-32 bit [63:32] should be zero */ if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) return 1; vmx->msr_ia32_umwait_control = data; break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && !guest_has_spec_ctrl_msr(vcpu)) return 1; if (kvm_spec_ctrl_test_value(data)) return 1; vmx->spec_ctrl = data; if (!data) break; /* * For non-nested: * When it's written (to non-zero) for the first time, pass * it through. * * For nested: * The handling of the MSR bitmap for L2 guests is done in * nested_vmx_prepare_msr_bitmap. We should not touch the * vmcs02.msr_bitmap here since it gets completely overwritten * in the merging. We update the vmcs01 here for L1 as well * since it will end up touching the MSR anyway now. */ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); break; case MSR_IA32_TSX_CTRL: if (!msr_info->host_initiated && !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) return 1; if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) return 1; goto find_uret_msr; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu)) return 1; if (data & ~PRED_CMD_IBPB) return 1; if (!boot_cpu_has(X86_FEATURE_IBPB)) return 1; if (!data) break; wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); /* * For non-nested: * When it's written (to non-zero) for the first time, pass * it through. * * For nested: * The handling of the MSR bitmap for L2 guests is done in * nested_vmx_prepare_msr_bitmap. We should not touch the * vmcs02.msr_bitmap here since it gets completely overwritten * in the merging. */ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W); break; case MSR_IA32_CR_PAT: if (!kvm_pat_valid(data)) return 1; if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) get_vmcs12(vcpu)->guest_ia32_pat = data; if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, data); vcpu->arch.pat = data; break; } ret = kvm_set_msr_common(vcpu, msr_info); break; case MSR_IA32_TSC_ADJUST: ret = kvm_set_msr_common(vcpu, msr_info); break; case MSR_IA32_MCG_EXT_CTL: if ((!msr_info->host_initiated && !(to_vmx(vcpu)->msr_ia32_feature_control & FEAT_CTL_LMCE_ENABLED)) || (data & ~MCG_EXT_CTL_LMCE_EN)) return 1; vcpu->arch.mcg_ext_ctl = data; break; case MSR_IA32_FEAT_CTL: if (!vmx_feature_control_msr_valid(vcpu, data) || (to_vmx(vcpu)->msr_ia32_feature_control & FEAT_CTL_LOCKED && !msr_info->host_initiated)) return 1; vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); /* SGX may be enabled/disabled by guest's firmware */ vmx_write_encls_bitmap(vcpu, NULL); break; case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: /* * On real hardware, the LE hash MSRs are writable before * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), * at which point SGX related bits in IA32_FEATURE_CONTROL * become writable. * * KVM does not emulate SGX activation for simplicity, so * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL * is unlocked. This is technically not architectural * behavior, but it's close enough. */ if (!msr_info->host_initiated && (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) || ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) return 1; vmx->msr_ia32_sgxlepubkeyhash [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!msr_info->host_initiated) return 1; /* they are read-only */ if (!nested_vmx_allowed(vcpu)) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_RTIT_CTL: if (!vmx_pt_mode_is_host_guest() || vmx_rtit_ctl_check(vcpu, data) || vmx->nested.vmxon) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); vmx->pt_desc.guest.ctl = data; pt_update_intercept_for_msr(vcpu); break; case MSR_IA32_RTIT_STATUS: if (!pt_can_write_msr(vmx)) return 1; if (data & MSR_IA32_RTIT_STATUS_MASK) return 1; vmx->pt_desc.guest.status = data; break; case MSR_IA32_RTIT_CR3_MATCH: if (!pt_can_write_msr(vmx)) return 1; if (!intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) return 1; vmx->pt_desc.guest.cr3_match = data; break; case MSR_IA32_RTIT_OUTPUT_BASE: if (!pt_can_write_msr(vmx)) return 1; if (!intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output) && !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_single_range_output)) return 1; if (!pt_output_base_valid(vcpu, data)) return 1; vmx->pt_desc.guest.output_base = data; break; case MSR_IA32_RTIT_OUTPUT_MASK: if (!pt_can_write_msr(vmx)) return 1; if (!intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output) && !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_single_range_output)) return 1; vmx->pt_desc.guest.output_mask = data; break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: if (!pt_can_write_msr(vmx)) return 1; index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_num_address_ranges)) return 1; if (is_noncanonical_address(data, vcpu)) return 1; if (index % 2) vmx->pt_desc.guest.addr_b[index / 2] = data; else vmx->pt_desc.guest.addr_a[index / 2] = data; break; case MSR_IA32_PERF_CAPABILITIES: if (data && !vcpu_to_pmu(vcpu)->version) return 1; if (data & PMU_CAP_LBR_FMT) { if ((data & PMU_CAP_LBR_FMT) != (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)) return 1; if (!intel_pmu_lbr_is_compatible(vcpu)) return 1; } ret = kvm_set_msr_common(vcpu, msr_info); break; default: find_uret_msr: msr = vmx_find_uret_msr(vmx, msr_index); if (msr) ret = vmx_set_guest_uret_msr(vmx, msr, data); else ret = kvm_set_msr_common(vcpu, msr_info); } /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ if (msr_index == MSR_IA32_ARCH_CAPABILITIES) vmx_update_fb_clear_dis(vcpu, vmx); return ret; } static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { unsigned long guest_owned_bits; kvm_register_mark_available(vcpu, reg); switch (reg) { case VCPU_REGS_RSP: vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); break; case VCPU_REGS_RIP: vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); break; case VCPU_EXREG_PDPTR: if (enable_ept) ept_save_pdptrs(vcpu); break; case VCPU_EXREG_CR0: guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; vcpu->arch.cr0 &= ~guest_owned_bits; vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; break; case VCPU_EXREG_CR3: /* * When intercepting CR3 loads, e.g. for shadowing paging, KVM's * CR3 is loaded into hardware, not the guest's CR3. */ if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); break; case VCPU_EXREG_CR4: guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; vcpu->arch.cr4 &= ~guest_owned_bits; vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; break; default: KVM_BUG_ON(1, vcpu->kvm); break; } } static __init int cpu_has_kvm_support(void) { return cpu_has_vmx(); } static __init int vmx_disabled_by_bios(void) { return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || !boot_cpu_has(X86_FEATURE_VMX); } static int kvm_cpu_vmxon(u64 vmxon_pointer) { u64 msr; cr4_set_bits(X86_CR4_VMXE); asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" _ASM_EXTABLE(1b, %l[fault]) : : [vmxon_pointer] "m"(vmxon_pointer) : : fault); return 0; fault: WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); cr4_clear_bits(X86_CR4_VMXE); return -EFAULT; } static int hardware_enable(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); int r; if (cr4_read_shadow() & X86_CR4_VMXE) return -EBUSY; /* * This can happen if we hot-added a CPU but failed to allocate * VP assist page for it. */ if (static_branch_unlikely(&enable_evmcs) && !hv_get_vp_assist_page(cpu)) return -EFAULT; intel_pt_handle_vmx(1); r = kvm_cpu_vmxon(phys_addr); if (r) { intel_pt_handle_vmx(0); return r; } if (enable_ept) ept_sync_global(); return 0; } static void vmclear_local_loaded_vmcss(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v, *n; list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), loaded_vmcss_on_cpu_link) __loaded_vmcs_clear(v); } static void hardware_disable(void) { vmclear_local_loaded_vmcss(); if (cpu_vmxoff()) kvm_spurious_fault(); intel_pt_handle_vmx(0); } /* * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID * directly instead of going through cpu_has(), to ensure KVM is trapping * ENCLS whenever it's supported in hardware. It does not matter whether * the host OS supports or has enabled SGX. */ static bool cpu_has_sgx(void) { return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { u32 vmx_msr_low, vmx_msr_high; u32 ctl = ctl_min | ctl_opt; rdmsr(msr, vmx_msr_low, vmx_msr_high); ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ /* Ensure minimum (required) set of control bits are supported. */ if (ctl_min & ~ctl) return -EIO; *result = ctl; return 0; } static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { u32 vmx_msr_low, vmx_msr_high; u32 min, opt, min2, opt2; u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; memset(vmcs_conf, 0, sizeof(*vmcs_conf)); min = CPU_BASED_HLT_EXITING | #ifdef CONFIG_X86_64 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | #endif CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETTING | CPU_BASED_MWAIT_EXITING | CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_RDPMC_EXITING; opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO; #ifdef CONFIG_X86_64 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & ~CPU_BASED_CR8_STORE_EXITING; #endif if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { min2 = 0; opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_PAUSE_LOOP_EXITING | SECONDARY_EXEC_DESC | SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES | SECONDARY_EXEC_RDSEED_EXITING | SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_PML | SECONDARY_EXEC_TSC_SCALING | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX | SECONDARY_EXEC_ENABLE_VMFUNC | SECONDARY_EXEC_BUS_LOCK_DETECTION; if (cpu_has_sgx()) opt2 |= SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) return -EIO; } #ifndef CONFIG_X86_64 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; #endif if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) _cpu_based_2nd_exec_control &= ~( SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, &vmx_cap->ept, &vmx_cap->vpid); if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { /* CR3 accesses and invlpg don't need to cause VM Exits when EPT enabled */ _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | CPU_BASED_INVLPG_EXITING); } else if (vmx_cap->ept) { vmx_cap->ept = 0; pr_warn_once("EPT CAP should not exist if not support " "1-setting enable EPT VM-execution control\n"); } if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && vmx_cap->vpid) { vmx_cap->vpid = 0; pr_warn_once("VPID CAP should not exist if not support " "1-setting enable VPID VM-execution control\n"); } min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; if (cpu_has_broken_vmx_preemption_timer()) _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; min = VM_ENTRY_LOAD_DEBUG_CONTROLS; opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER | VM_ENTRY_LOAD_BNDCFGS | VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; /* * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they * can't be used due to an errata where VM Exit may incorrectly clear * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. */ if (boot_cpu_data.x86 == 0x6) { switch (boot_cpu_data.x86_model) { case 26: /* AAK155 */ case 30: /* AAP115 */ case 37: /* AAT100 */ case 44: /* BC86,AAY89,BD102 */ case 46: /* BA97 */ _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " "does not work properly. Using workaround\n"); break; default: break; } } rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) return -EIO; #ifdef CONFIG_X86_64 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ if (vmx_msr_high & (1u<<16)) return -EIO; #endif /* Require Write-Back (WB) memory type for VMCS accesses. */ if (((vmx_msr_high >> 18) & 15) != 6) return -EIO; vmcs_conf->size = vmx_msr_high & 0x1fff; vmcs_conf->order = get_order(vmcs_conf->size); vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; vmcs_conf->revision_id = vmx_msr_low; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; #if IS_ENABLED(CONFIG_HYPERV) if (enlightened_vmcs) evmcs_sanitize_exec_ctrls(vmcs_conf); #endif return 0; } struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) { int node = cpu_to_node(cpu); struct page *pages; struct vmcs *vmcs; pages = __alloc_pages_node(node, flags, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); memset(vmcs, 0, vmcs_config.size); /* KVM supports Enlightened VMCS v1 only */ if (static_branch_unlikely(&enable_evmcs)) vmcs->hdr.revision_id = KVM_EVMCS_VERSION; else vmcs->hdr.revision_id = vmcs_config.revision_id; if (shadow) vmcs->hdr.shadow_vmcs = 1; return vmcs; } void free_vmcs(struct vmcs *vmcs) { free_pages((unsigned long)vmcs, vmcs_config.order); } /* * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded */ void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) { if (!loaded_vmcs->vmcs) return; loaded_vmcs_clear(loaded_vmcs); free_vmcs(loaded_vmcs->vmcs); loaded_vmcs->vmcs = NULL; if (loaded_vmcs->msr_bitmap) free_page((unsigned long)loaded_vmcs->msr_bitmap); WARN_ON(loaded_vmcs->shadow_vmcs != NULL); } int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) { loaded_vmcs->vmcs = alloc_vmcs(false); if (!loaded_vmcs->vmcs) return -ENOMEM; vmcs_clear(loaded_vmcs->vmcs); loaded_vmcs->shadow_vmcs = NULL; loaded_vmcs->hv_timer_soft_disabled = false; loaded_vmcs->cpu = -1; loaded_vmcs->launched = 0; if (cpu_has_vmx_msr_bitmap()) { loaded_vmcs->msr_bitmap = (unsigned long *) __get_free_page(GFP_KERNEL_ACCOUNT); if (!loaded_vmcs->msr_bitmap) goto out_vmcs; memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); } memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); memset(&loaded_vmcs->controls_shadow, 0, sizeof(struct vmcs_controls_shadow)); return 0; out_vmcs: free_loaded_vmcs(loaded_vmcs); return -ENOMEM; } static void free_kvm_area(void) { int cpu; for_each_possible_cpu(cpu) { free_vmcs(per_cpu(vmxarea, cpu)); per_cpu(vmxarea, cpu) = NULL; } } static __init int alloc_kvm_area(void) { int cpu; for_each_possible_cpu(cpu) { struct vmcs *vmcs; vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); if (!vmcs) { free_kvm_area(); return -ENOMEM; } /* * When eVMCS is enabled, alloc_vmcs_cpu() sets * vmcs->revision_id to KVM_EVMCS_VERSION instead of * revision_id reported by MSR_IA32_VMX_BASIC. * * However, even though not explicitly documented by * TLFS, VMXArea passed as VMXON argument should * still be marked with revision_id reported by * physical CPU. */ if (static_branch_unlikely(&enable_evmcs)) vmcs->hdr.revision_id = vmcs_config.revision_id; per_cpu(vmxarea, cpu) = vmcs; } return 0; } static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) { if (!emulate_invalid_guest_state) { /* * CS and SS RPL should be equal during guest entry according * to VMX spec, but in reality it is not always so. Since vcpu * is in the middle of the transition from real mode to * protected mode it is safe to assume that RPL 0 is a good * default value. */ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) save->selector &= ~SEGMENT_RPL_MASK; save->dpl = save->selector & SEGMENT_RPL_MASK; save->s = 1; } __vmx_set_segment(vcpu, save, seg); } static void enter_pmode(struct kvm_vcpu *vcpu) { unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Update real mode segment cache. It may be not up-to-date if segment * register was written while vcpu was in a guest mode. */ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); vmx->rmode.vm86_active = 0; __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); flags = vmcs_readl(GUEST_RFLAGS); flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); vmx_update_exception_bitmap(vcpu); fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); } static void fix_rmode_seg(int seg, struct kvm_segment *save) { const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; struct kvm_segment var = *save; var.dpl = 0x3; if (seg == VCPU_SREG_CS) var.type = 0x3; if (!emulate_invalid_guest_state) { var.selector = var.base >> 4; var.base = var.base & 0xffff0; var.limit = 0xffff; var.g = 0; var.db = 0; var.present = 1; var.s = 1; var.l = 0; var.unusable = 0; var.type = 0x3; var.avl = 0; if (save->base & 0xf) printk_once(KERN_WARNING "kvm: segment base is not " "paragraph aligned when entering " "protected mode (seg=%d)", seg); } vmcs_write16(sf->selector, var.selector); vmcs_writel(sf->base, var.base); vmcs_write32(sf->limit, var.limit); vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); } static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); /* * KVM should never use VM86 to virtualize Real Mode when L2 is active, * as using VM86 is unnecessary if unrestricted guest is enabled, and * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0 * should VM-Fail and KVM should reject userspace attempts to stuff * CR0.PG=0 when L2 is active. */ WARN_ON_ONCE(is_guest_mode(vcpu)); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); vmx->rmode.vm86_active = 1; /* * Very old userspace does not call KVM_SET_TSS_ADDR before entering * vcpu. Warn the user that an update is overdue. */ if (!kvm_vmx->tss_addr) printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " "called before entering vcpu\n"); vmx_segment_cache_clear(vmx); vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); vmx->rmode.save_rflags = flags; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); vmx_update_exception_bitmap(vcpu); fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); } int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER); /* Nothing to do if hardware doesn't support EFER. */ if (!msr) return 0; vcpu->arch.efer = efer; if (efer & EFER_LMA) { vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); msr->data = efer; } else { vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); msr->data = efer & ~EFER_LME; } vmx_setup_uret_msrs(vmx); return 0; } #ifdef CONFIG_X86_64 static void enter_lmode(struct kvm_vcpu *vcpu) { u32 guest_tr_ar; vmx_segment_cache_clear(to_vmx(vcpu)); guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { pr_debug_ratelimited("%s: tss fixup for long mode. \n", __func__); vmcs_write32(GUEST_TR_AR_BYTES, (guest_tr_ar & ~VMX_AR_TYPE_MASK) | VMX_AR_TYPE_BUSY_64_TSS); } vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); } static void exit_lmode(struct kvm_vcpu *vcpu) { vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); } #endif static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* * INVEPT must be issued when EPT is enabled, irrespective of VPID, as * the CPU is not required to invalidate guest-physical mappings on * VM-Entry, even if VPID is disabled. Guest-physical mappings are * associated with the root EPT structure and not any particular VPID * (INVVPID also isn't required to invalidate guest-physical mappings). */ if (enable_ept) { ept_sync_global(); } else if (enable_vpid) { if (cpu_has_vmx_invvpid_global()) { vpid_sync_vcpu_global(); } else { vpid_sync_vcpu_single(vmx->vpid); vpid_sync_vcpu_single(vmx->nested.vpid02); } } } static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu))) return nested_get_vpid02(vcpu); return to_vmx(vcpu)->vpid; } static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; u64 root_hpa = mmu->root_hpa; /* No flush required if the current context is invalid. */ if (!VALID_PAGE(root_hpa)) return; if (enable_ept) ept_sync_context(construct_eptp(vcpu, root_hpa, mmu->shadow_root_level)); else vpid_sync_context(vmx_get_current_vpid(vcpu)); } static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) { /* * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in * vmx_flush_tlb_guest() for an explanation of why this is ok. */ vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); } static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) { /* * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are * required to flush GVA->{G,H}PA mappings from the TLB if vpid is * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), * i.e. no explicit INVVPID is necessary. */ vpid_sync_context(vmx_get_current_vpid(vcpu)); } void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR)) return; if (is_pae_paging(vcpu)) { vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); } } void ept_save_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; if (WARN_ON_ONCE(!is_pae_paging(vcpu))) return; mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); } #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ CPU_BASED_CR3_STORE_EXITING) static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { if (is_guest_mode(vcpu)) return nested_guest_cr0_valid(vcpu, cr0); if (to_vmx(vcpu)->nested.vmxon) return nested_host_cr0_valid(vcpu, cr0); return true; } void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long hw_cr0, old_cr0_pg; u32 tmp; old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); if (enable_unrestricted_guest) hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; if (!enable_ept) hw_cr0 |= X86_CR0_WP; if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) enter_rmode(vcpu); } vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME) { if (!old_cr0_pg && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) exit_lmode(vcpu); } #endif if (enable_ept && !enable_unrestricted_guest) { /* * Ensure KVM has an up-to-date snapshot of the guest's CR3. If * the below code _enables_ CR3 exiting, vmx_cache_reg() will * (correctly) stop reading vmcs.GUEST_CR3 because it thinks * KVM's CR3 is installed. */ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) vmx_cache_reg(vcpu, VCPU_EXREG_CR3); /* * When running with EPT but not unrestricted guest, KVM must * intercept CR3 accesses when paging is _disabled_. This is * necessary because restricted guests can't actually run with * paging disabled, and so KVM stuffs its own CR3 in order to * run the guest when identity mapped page tables. * * Do _NOT_ check the old CR0.PG, e.g. to optimize away the * update, it may be stale with respect to CR3 interception, * e.g. after nested VM-Enter. * * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or * stores to forward them to L1, even if KVM does not need to * intercept them to preserve its identity mapped page tables. */ if (!(cr0 & X86_CR0_PG)) { exec_controls_setbit(vmx, CR3_EXITING_BITS); } else if (!is_guest_mode(vcpu)) { exec_controls_clearbit(vmx, CR3_EXITING_BITS); } else { tmp = exec_controls_get(vmx); tmp &= ~CR3_EXITING_BITS; tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; exec_controls_set(vmx, tmp); } /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ if ((old_cr0_pg ^ cr0) & X86_CR0_PG) vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } /* depends on vcpu->arch.cr0 to be set to a new value */ vmx->emulation_required = vmx_emulation_required(vcpu); } static int vmx_get_max_tdp_level(void) { if (cpu_has_vmx_ept_5levels()) return 5; return 4; } u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { u64 eptp = VMX_EPTP_MT_WB; eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; if (enable_ept_ad_bits && (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) eptp |= VMX_EPTP_AD_ENABLE_BIT; eptp |= root_hpa; return eptp; } static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; unsigned long guest_cr3; u64 eptp; if (enable_ept) { eptp = construct_eptp(vcpu, root_hpa, root_level); vmcs_write64(EPT_POINTER, eptp); hv_track_root_tdp(vcpu, root_hpa); if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) guest_cr3 = vcpu->arch.cr3; else /* vmcs01.GUEST_CR3 is already up-to-date. */ update_guest_cr3 = false; vmx_ept_load_pdptrs(vcpu); } else { guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu); } if (update_guest_cr3) vmcs_writel(GUEST_CR3, guest_cr3); } static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { /* * We operate under the default treatment of SMM, so VMX cannot be * enabled under SMM. Note, whether or not VMXE is allowed at all, * i.e. is a reserved bit, is handled by common x86 code. */ if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) return false; if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) return false; return true; } void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = vcpu->arch.cr4; struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Pass through host's Machine Check Enable value to hw_cr4, which * is in force while we are in guest mode. Do not let guests control * this bit, even if host CR4.MCE == 0. */ unsigned long hw_cr4; hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); if (enable_unrestricted_guest) hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; else if (vmx->rmode.vm86_active) hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; else hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) { if (cr4 & X86_CR4_UMIP) { secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC); hw_cr4 &= ~X86_CR4_UMIP; } else if (!is_guest_mode(vcpu) || !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) { secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC); } } vcpu->arch.cr4 = cr4; kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); if (!enable_unrestricted_guest) { if (enable_ept) { if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } } /* * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in * hardware. To emulate this behavior, SMEP/SMAP/PKU needs * to be manually disabled when guest switches to non-paging * mode. * * If !enable_unrestricted_guest, the CPU is always running * with CR0.PG=1 and CR4 needs to be modified. * If enable_unrestricted_guest, the CPU automatically * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. */ if (!is_paging(vcpu)) hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); } vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) kvm_update_cpuid_runtime(vcpu); } void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 ar; if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { *var = vmx->rmode.segs[seg]; if (seg == VCPU_SREG_TR || var->selector == vmx_read_guest_seg_selector(vmx, seg)) return; var->base = vmx_read_guest_seg_base(vmx, seg); var->selector = vmx_read_guest_seg_selector(vmx, seg); return; } var->base = vmx_read_guest_seg_base(vmx, seg); var->limit = vmx_read_guest_seg_limit(vmx, seg); var->selector = vmx_read_guest_seg_selector(vmx, seg); ar = vmx_read_guest_seg_ar(vmx, seg); var->unusable = (ar >> 16) & 1; var->type = ar & 15; var->s = (ar >> 4) & 1; var->dpl = (ar >> 5) & 3; /* * Some userspaces do not preserve unusable property. Since usable * segment has to be present according to VMX spec we can use present * property to amend userspace bug by making unusable segment always * nonpresent. vmx_segment_access_rights() already marks nonpresent * segment as unusable. */ var->present = !var->unusable; var->avl = (ar >> 12) & 1; var->l = (ar >> 13) & 1; var->db = (ar >> 14) & 1; var->g = (ar >> 15) & 1; } static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) { struct kvm_segment s; if (to_vmx(vcpu)->rmode.vm86_active) { vmx_get_segment(vcpu, &s, seg); return s.base; } return vmx_read_guest_seg_base(to_vmx(vcpu), seg); } int vmx_get_cpl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (unlikely(vmx->rmode.vm86_active)) return 0; else { int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); return VMX_AR_DPL(ar); } } static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; ar = var->type & 15; ar |= (var->s & 1) << 4; ar |= (var->dpl & 3) << 5; ar |= (var->present & 1) << 7; ar |= (var->avl & 1) << 12; ar |= (var->l & 1) << 13; ar |= (var->db & 1) << 14; ar |= (var->g & 1) << 15; ar |= (var->unusable || !var->present) << 16; return ar; } void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; vmx_segment_cache_clear(vmx); if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { vmx->rmode.segs[seg] = *var; if (seg == VCPU_SREG_TR) vmcs_write16(sf->selector, var->selector); else if (var->s) fix_rmode_seg(seg, &vmx->rmode.segs[seg]); return; } vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); /* * Fix the "Accessed" bit in AR field of segment registers for older * qemu binaries. * IA32 arch specifies that at the time of processor reset the * "Accessed" bit in the AR field of segment registers is 1. And qemu * is setting it to 0 in the userland code. This causes invalid guest * state vmexit when "unrestricted guest" mode is turned on. * Fix for this setup issue in cpu_reset is being pushed in the qemu * tree. Newer qemu binaries with that qemu fix would not need this * kvm hack. */ if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) var->type |= 0x1; /* Accessed */ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); } static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { __vmx_set_segment(vcpu, var, seg); to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) { u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); *db = (ar >> 14) & 1; *l = (ar >> 13) & 1; } static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { dt->size = vmcs_read32(GUEST_IDTR_LIMIT); dt->address = vmcs_readl(GUEST_IDTR_BASE); } static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { vmcs_write32(GUEST_IDTR_LIMIT, dt->size); vmcs_writel(GUEST_IDTR_BASE, dt->address); } static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { dt->size = vmcs_read32(GUEST_GDTR_LIMIT); dt->address = vmcs_readl(GUEST_GDTR_BASE); } static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { vmcs_write32(GUEST_GDTR_LIMIT, dt->size); vmcs_writel(GUEST_GDTR_BASE, dt->address); } static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) { struct kvm_segment var; u32 ar; vmx_get_segment(vcpu, &var, seg); var.dpl = 0x3; if (seg == VCPU_SREG_CS) var.type = 0x3; ar = vmx_segment_access_rights(&var); if (var.base != (var.selector << 4)) return false; if (var.limit != 0xffff) return false; if (ar != 0xf3) return false; return true; } static bool code_segment_valid(struct kvm_vcpu *vcpu) { struct kvm_segment cs; unsigned int cs_rpl; vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); cs_rpl = cs.selector & SEGMENT_RPL_MASK; if (cs.unusable) return false; if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) return false; if (!cs.s) return false; if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { if (cs.dpl > cs_rpl) return false; } else { if (cs.dpl != cs_rpl) return false; } if (!cs.present) return false; /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ return true; } static bool stack_segment_valid(struct kvm_vcpu *vcpu) { struct kvm_segment ss; unsigned int ss_rpl; vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); ss_rpl = ss.selector & SEGMENT_RPL_MASK; if (ss.unusable) return true; if (ss.type != 3 && ss.type != 7) return false; if (!ss.s) return false; if (ss.dpl != ss_rpl) /* DPL != RPL */ return false; if (!ss.present) return false; return true; } static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) { struct kvm_segment var; unsigned int rpl; vmx_get_segment(vcpu, &var, seg); rpl = var.selector & SEGMENT_RPL_MASK; if (var.unusable) return true; if (!var.s) return false; if (!var.present) return false; if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { if (var.dpl < rpl) /* DPL < RPL */ return false; } /* TODO: Add other members to kvm_segment_field to allow checking for other access * rights flags */ return true; } static bool tr_valid(struct kvm_vcpu *vcpu) { struct kvm_segment tr; vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); if (tr.unusable) return false; if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ return false; if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ return false; if (!tr.present) return false; return true; } static bool ldtr_valid(struct kvm_vcpu *vcpu) { struct kvm_segment ldtr; vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); if (ldtr.unusable) return true; if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ return false; if (ldtr.type != 2) return false; if (!ldtr.present) return false; return true; } static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) { struct kvm_segment cs, ss; vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); return ((cs.selector & SEGMENT_RPL_MASK) == (ss.selector & SEGMENT_RPL_MASK)); } /* * Check if guest state is valid. Returns true if valid, false if * not. * We assume that registers are always usable */ bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) { /* real mode guest state checks */ if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) return false; } else { /* protected mode guest state checks */ if (!cs_ss_rpl_check(vcpu)) return false; if (!code_segment_valid(vcpu)) return false; if (!stack_segment_valid(vcpu)) return false; if (!data_segment_valid(vcpu, VCPU_SREG_DS)) return false; if (!data_segment_valid(vcpu, VCPU_SREG_ES)) return false; if (!data_segment_valid(vcpu, VCPU_SREG_FS)) return false; if (!data_segment_valid(vcpu, VCPU_SREG_GS)) return false; if (!tr_valid(vcpu)) return false; if (!ldtr_valid(vcpu)) return false; } /* TODO: * - Add checks on RIP * - Add checks on RFLAGS */ return true; } static int init_rmode_tss(struct kvm *kvm, void __user *ua) { const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); u16 data; int i; for (i = 0; i < 3; i++) { if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE)) return -EFAULT; } data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16))) return -EFAULT; data = ~0; if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8))) return -EFAULT; return 0; } static int init_rmode_identity_map(struct kvm *kvm) { struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); int i, r = 0; void __user *uaddr; u32 tmp; /* Protect kvm_vmx->ept_identity_pagetable_done. */ mutex_lock(&kvm->slots_lock); if (likely(kvm_vmx->ept_identity_pagetable_done)) goto out; if (!kvm_vmx->ept_identity_map_addr) kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; uaddr = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, kvm_vmx->ept_identity_map_addr, PAGE_SIZE); if (IS_ERR(uaddr)) { r = PTR_ERR(uaddr); goto out; } /* Set up identity-mapping pagetable for EPT in real mode */ for (i = 0; i < PT32_ENT_PER_PAGE; i++) { tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { r = -EFAULT; goto out; } } kvm_vmx->ept_identity_pagetable_done = true; out: mutex_unlock(&kvm->slots_lock); return r; } static void seg_setup(int seg) { const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; unsigned int ar; vmcs_write16(sf->selector, 0); vmcs_writel(sf->base, 0); vmcs_write32(sf->limit, 0xffff); ar = 0x93; if (seg == VCPU_SREG_CS) ar |= 0x08; /* code segment */ vmcs_write32(sf->ar_bytes, ar); } static int alloc_apic_access_page(struct kvm *kvm) { struct page *page; void __user *hva; int ret = 0; mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_memslot_enabled) goto out; hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); if (IS_ERR(hva)) { ret = PTR_ERR(hva); goto out; } page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (is_error_page(page)) { ret = -EFAULT; goto out; } /* * Do not pin the page in memory, so that memory hot-unplug * is able to migrate it. */ put_page(page); kvm->arch.apic_access_memslot_enabled = true; out: mutex_unlock(&kvm->slots_lock); return ret; } int allocate_vpid(void) { int vpid; if (!enable_vpid) return 0; spin_lock(&vmx_vpid_lock); vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); if (vpid < VMX_NR_VPIDS) __set_bit(vpid, vmx_vpid_bitmap); else vpid = 0; spin_unlock(&vmx_vpid_lock); return vpid; } void free_vpid(int vpid) { if (!enable_vpid || vpid == 0) return; spin_lock(&vmx_vpid_lock); __clear_bit(vpid, vmx_vpid_bitmap); spin_unlock(&vmx_vpid_lock); } static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) { /* * When KVM is a nested hypervisor on top of Hyper-V and uses * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR * bitmap has changed. */ if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) { struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; if (evmcs->hv_enlightenments_control.msr_bitmap) evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; } } void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; if (!cpu_has_vmx_msr_bitmap()) return; vmx_msr_bitmap_l01_changed(vmx); /* * Mark the desired intercept state in shadow bitmap, this is needed * for resync when the MSR filters change. */ if (is_valid_passthrough_msr(msr)) { int idx = possible_passthrough_msr_slot(msr); if (idx != -ENOENT) { if (type & MSR_TYPE_R) clear_bit(idx, vmx->shadow_msr_intercept.read); if (type & MSR_TYPE_W) clear_bit(idx, vmx->shadow_msr_intercept.write); } } if ((type & MSR_TYPE_R) && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) { vmx_set_msr_bitmap_read(msr_bitmap, msr); type &= ~MSR_TYPE_R; } if ((type & MSR_TYPE_W) && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) { vmx_set_msr_bitmap_write(msr_bitmap, msr); type &= ~MSR_TYPE_W; } if (type & MSR_TYPE_R) vmx_clear_msr_bitmap_read(msr_bitmap, msr); if (type & MSR_TYPE_W) vmx_clear_msr_bitmap_write(msr_bitmap, msr); } void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; if (!cpu_has_vmx_msr_bitmap()) return; vmx_msr_bitmap_l01_changed(vmx); /* * Mark the desired intercept state in shadow bitmap, this is needed * for resync when the MSR filter changes. */ if (is_valid_passthrough_msr(msr)) { int idx = possible_passthrough_msr_slot(msr); if (idx != -ENOENT) { if (type & MSR_TYPE_R) set_bit(idx, vmx->shadow_msr_intercept.read); if (type & MSR_TYPE_W) set_bit(idx, vmx->shadow_msr_intercept.write); } } if (type & MSR_TYPE_R) vmx_set_msr_bitmap_read(msr_bitmap, msr); if (type & MSR_TYPE_W) vmx_set_msr_bitmap_write(msr_bitmap, msr); } static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) { unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; unsigned long read_intercept; int msr; read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { unsigned int read_idx = msr / BITS_PER_LONG; unsigned int write_idx = read_idx + (0x800 / sizeof(long)); msr_bitmap[read_idx] = read_intercept; msr_bitmap[write_idx] = ~0ul; } } static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u8 mode; if (!cpu_has_vmx_msr_bitmap()) return; if (cpu_has_secondary_exec_ctrls() && (secondary_exec_controls_get(vmx) & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { mode = MSR_BITMAP_MODE_X2APIC; if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) mode |= MSR_BITMAP_MODE_X2APIC_APICV; } else { mode = 0; } if (mode == vmx->x2apic_msr_bitmap_mode) return; vmx->x2apic_msr_bitmap_mode = mode; vmx_reset_x2apic_msrs(vcpu, mode); /* * TPR reads and writes can be virtualized even if virtual interrupt * delivery is not in use. */ vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW, !(mode & MSR_BITMAP_MODE_X2APIC)); if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); } } void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); u32 i; vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); for (i = 0; i < vmx->pt_desc.addr_range; i++) { vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); } } static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); void *vapic_page; u32 vppr; int rvi; if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || !nested_cpu_has_vid(get_vmcs12(vcpu)) || WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn)) return false; rvi = vmx_get_rvi(); vapic_page = vmx->nested.virtual_apic_map.hva; vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); return ((rvi & 0xf0) > (vppr & 0xf0)); } static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 i; /* * Set intercept permissions for all potentially passed through MSRs * again. They will automatically get filtered through the MSR filter, * so we are back in sync after this. */ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { u32 msr = vmx_possible_passthrough_msrs[i]; bool read = test_bit(i, vmx->shadow_msr_intercept.read); bool write = test_bit(i, vmx->shadow_msr_intercept.write); vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read); vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write); } pt_update_intercept_for_msr(vcpu); } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, bool nested) { #ifdef CONFIG_SMP int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; if (vcpu->mode == IN_GUEST_MODE) { /* * The vector of interrupt to be delivered to vcpu had * been set in PIR before this function. * * Following cases will be reached in this block, and * we always send a notification event in all cases as * explained below. * * Case 1: vcpu keeps in non-root mode. Sending a * notification event posts the interrupt to vcpu. * * Case 2: vcpu exits to root mode and is still * runnable. PIR will be synced to vIRR before the * next vcpu entry. Sending a notification event in * this case has no effect, as vcpu is not in root * mode. * * Case 3: vcpu exits to root mode and is blocked. * vcpu_block() has already synced PIR to vIRR and * never blocks vcpu if vIRR is not cleared. Therefore, * a blocked vcpu here does not wait for any requested * interrupts in PIR, and sending a notification event * which has no effect is safe here. */ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); return true; } #endif return false; } static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (is_guest_mode(vcpu) && vector == vmx->nested.posted_intr_nv) { /* * If a posted intr is not recognized by hardware, * we will accomplish it in the next vmentry. */ vmx->nested.pi_pending = true; kvm_make_request(KVM_REQ_EVENT, vcpu); /* the PIR and ON have been set by L1. */ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) kvm_vcpu_kick(vcpu); return 0; } return -1; } /* * Send interrupt to vcpu via posted interrupt way. * 1. If target vcpu is running(non-root mode), send posted interrupt * notification to vcpu and hardware will sync PIR to vIRR atomically. * 2. If target vcpu isn't running(root mode), kick it to pick up the * interrupt from PIR in next vmentry. */ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { struct vcpu_vmx *vmx = to_vmx(vcpu); int r; r = vmx_deliver_nested_posted_interrupt(vcpu, vector); if (!r) return 0; if (!vcpu->arch.apicv_active) return -1; if (pi_test_and_set_pir(vector, &vmx->pi_desc)) return 0; /* If a previous notification has sent the IPI, nothing to do. */ if (pi_test_and_set_on(&vmx->pi_desc)) return 0; if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) kvm_vcpu_kick(vcpu); return 0; } /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. * Note that host-state that does change is set elsewhere. E.g., host-state * that is set differently for each CPU is set in vmx_vcpu_load(), not here. */ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) { u32 low32, high32; unsigned long tmpl; unsigned long cr0, cr3, cr4; cr0 = read_cr0(); WARN_ON(cr0 & X86_CR0_TS); vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ /* * Save the most likely value for this task's CR3 in the VMCS. * We can't use __get_current_cr3_fast() because we're not atomic. */ cr3 = __read_cr3(); vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ vmx->loaded_vmcs->host_state.cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. */ cr4 = cr4_read_shadow(); vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ vmx->loaded_vmcs->host_state.cr4 = cr4; vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 /* * Load null selectors, so we can avoid reloading them in * vmx_prepare_switch_to_host(), in case userspace uses * the null selectors too (the expected case). */ vmcs_write16(HOST_DS_SELECTOR, 0); vmcs_write16(HOST_ES_SELECTOR, 0); #else vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ #endif vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */ vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); vmcs_write32(HOST_IA32_SYSENTER_CS, low32); rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { rdmsr(MSR_IA32_CR_PAT, low32, high32); vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); } if (cpu_has_load_ia32_efer()) vmcs_write64(HOST_IA32_EFER, host_efer); } void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) { struct kvm_vcpu *vcpu = &vmx->vcpu; vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & ~vcpu->arch.cr4_guest_rsvd_bits; if (!enable_ept) vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE; if (is_guest_mode(&vmx->vcpu)) vcpu->arch.cr4_guest_owned_bits &= ~get_vmcs12(vcpu)->cr4_guest_host_mask; vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); } static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; if (!kvm_vcpu_apicv_active(&vmx->vcpu)) pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; if (!enable_vnmi) pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; if (!enable_preemption_timer) pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; return pin_based_exec_ctrl; } static u32 vmx_vmentry_ctrl(void) { u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; if (vmx_pt_mode_is_system()) vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ return vmentry_ctrl & ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); } static u32 vmx_vmexit_ctrl(void) { u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; if (vmx_pt_mode_is_system()) vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ return vmexit_ctrl & ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); } static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (is_guest_mode(vcpu)) { vmx->nested.update_vmcs01_apicv_status = true; return; } pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); if (cpu_has_secondary_exec_ctrls()) { if (kvm_vcpu_apicv_active(vcpu)) secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); else secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); } vmx_update_msr_bitmap_x2apic(vcpu); } static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) exec_control &= ~CPU_BASED_MOV_DR_EXITING; if (!cpu_need_tpr_shadow(&vmx->vcpu)) { exec_control &= ~CPU_BASED_TPR_SHADOW; #ifdef CONFIG_X86_64 exec_control |= CPU_BASED_CR8_STORE_EXITING | CPU_BASED_CR8_LOAD_EXITING; #endif } if (!enable_ept) exec_control |= CPU_BASED_CR3_STORE_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_INVLPG_EXITING; if (kvm_mwait_in_guest(vmx->vcpu.kvm)) exec_control &= ~(CPU_BASED_MWAIT_EXITING | CPU_BASED_MONITOR_EXITING); if (kvm_hlt_in_guest(vmx->vcpu.kvm)) exec_control &= ~CPU_BASED_HLT_EXITING; return exec_control; } /* * Adjust a single secondary execution control bit to intercept/allow an * instruction in the guest. This is usually done based on whether or not a * feature has been exposed to the guest in order to correctly emulate faults. */ static inline void vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, u32 control, bool enabled, bool exiting) { /* * If the control is for an opt-in feature, clear the control if the * feature is not exposed to the guest, i.e. not enabled. If the * control is opt-out, i.e. an exiting control, clear the control if * the feature _is_ exposed to the guest, i.e. exiting/interception is * disabled for the associated instruction. Note, the caller is * responsible presetting exec_control to set all supported bits. */ if (enabled == exiting) *exec_control &= ~control; /* * Update the nested MSR settings so that a nested VMM can/can't set * controls for features that are/aren't exposed to the guest. */ if (nested) { if (enabled) vmx->nested.msrs.secondary_ctls_high |= control; else vmx->nested.msrs.secondary_ctls_high &= ~control; } } /* * Wrapper macro for the common case of adjusting a secondary execution control * based on a single guest CPUID bit, with a dedicated feature bit. This also * verifies that the control is actually supported by KVM and hardware. */ #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ ({ \ bool __enabled; \ \ if (cpu_has_vmx_##name()) { \ __enabled = guest_cpuid_has(&(vmx)->vcpu, \ X86_FEATURE_##feat_name); \ vmx_adjust_secondary_exec_control(vmx, exec_control, \ SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \ } \ }) /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */ #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false) #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { struct kvm_vcpu *vcpu = &vmx->vcpu; u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; if (vmx_pt_mode_is_system()) exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); if (!cpu_need_virtualize_apic_accesses(vcpu)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; enable_unrestricted_guest = 0; } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (kvm_pause_in_guest(vmx->vcpu.kvm)) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; if (!kvm_vcpu_apicv_active(vcpu)) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, * in vmx_set_cr4. */ exec_control &= ~SECONDARY_EXEC_DESC; /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD (handle_vmptrld). We can NOT enable shadow_vmcs here because we don't have yet a current VMCS12 */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; /* * PML is enabled/disabled when dirty logging of memsmlots changes, but * it needs to be set here when dirty logging is already active, e.g. * if this vCPU was created after dirty logging was enabled. */ if (!vcpu->kvm->arch.cpu_dirty_logging_count) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; if (cpu_has_vmx_xsaves()) { /* Exposing XSAVES only when XSAVE is exposed */ bool xsaves_enabled = boot_cpu_has(X86_FEATURE_XSAVE) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); vcpu->arch.xsaves_enabled = xsaves_enabled; vmx_adjust_secondary_exec_control(vmx, &exec_control, SECONDARY_EXEC_XSAVES, xsaves_enabled, false); } /* * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either * feature is exposed to the guest. This creates a virtualization hole * if both are supported in hardware but only one is exposed to the * guest, but letting the guest execute RDTSCP or RDPID when either one * is advertised is preferable to emulating the advertised instruction * in KVM on #UD, and obviously better than incorrectly injecting #UD. */ if (cpu_has_vmx_rdtscp()) { bool rdpid_or_rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || guest_cpuid_has(vcpu, X86_FEATURE_RDPID); vmx_adjust_secondary_exec_control(vmx, &exec_control, SECONDARY_EXEC_ENABLE_RDTSCP, rdpid_or_rdtscp_enabled, false); } vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, ENABLE_USR_WAIT_PAUSE, false); if (!vcpu->kvm->arch.bus_lock_detection_enabled) exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; return exec_control; } #define VMX_XSS_EXIT_BITMAP 0 /* * Noting that the initialization of Guest-state Area of VMCS is in * vmx_vcpu_reset(). */ static void init_vmcs(struct vcpu_vmx *vmx) { if (nested) nested_vmx_set_vmcs_shadowing_bitmap(); if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ /* Control */ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); exec_controls_set(vmx, vmx_exec_control(vmx)); if (cpu_has_secondary_exec_ctrls()) secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); if (kvm_vcpu_apicv_active(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); vmcs_write64(EOI_EXIT_BITMAP3, 0); vmcs_write16(GUEST_INTR_STATUS, 0); vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { vmcs_write32(PLE_GAP, ple_gap); vmx->ple_window = ple_window; vmx->ple_window_dirty = true; } vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmx_set_constant_host_state(vmx); vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ if (cpu_has_vmx_vmfunc()) vmcs_write64(VM_FUNCTION_CONTROL, 0); vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); /* 22.2.1, 20.8.1 */ vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); set_cr4_guest_host_mask(vmx); if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); if (cpu_has_vmx_xsaves()) vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); if (enable_pml) { vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } vmx_write_encls_bitmap(&vmx->vcpu, NULL); if (vmx_pt_mode_is_host_guest()) { memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); /* Bit[6~0] are forced to 1, writes are ignored. */ vmx->pt_desc.guest.output_mask = 0x7F; vmcs_write64(GUEST_IA32_RTIT_CTL, 0); } vmcs_write32(GUEST_SYSENTER_CS, 0); vmcs_writel(GUEST_SYSENTER_ESP, 0); vmcs_writel(GUEST_SYSENTER_EIP, 0); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); if (cpu_has_vmx_tpr_shadow()) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); if (cpu_need_tpr_shadow(&vmx->vcpu)) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, __pa(vmx->vcpu.arch.apic->regs)); vmcs_write32(TPR_THRESHOLD, 0); } vmx_setup_uret_msrs(vmx); } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); vmx->rmode.vm86_active = 0; vmx->spec_ctrl = 0; vmx->msr_ia32_umwait_control = 0; vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); vmx_segment_cache_clear(vmx); seg_setup(VCPU_SREG_CS); vmcs_write16(GUEST_CS_SELECTOR, 0xf000); vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); seg_setup(VCPU_SREG_DS); seg_setup(VCPU_SREG_ES); seg_setup(VCPU_SREG_FS); seg_setup(VCPU_SREG_GS); seg_setup(VCPU_SREG_SS); vmcs_write16(GUEST_TR_SELECTOR, 0); vmcs_writel(GUEST_TR_BASE, 0); vmcs_write32(GUEST_TR_LIMIT, 0xffff); vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); vmcs_write16(GUEST_LDTR_SELECTOR, 0); vmcs_writel(GUEST_LDTR_BASE, 0); vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); vmcs_writel(GUEST_GDTR_BASE, 0); vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); vmcs_writel(GUEST_IDTR_BASE, 0); vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); if (kvm_mpx_supported()) vmcs_write64(GUEST_BNDCFGS, 0); vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); vpid_sync_context(vmx->vpid); vmx_update_fb_clear_dis(vcpu, vmx); } static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) { exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); } static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) { if (!enable_vnmi || vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { vmx_enable_irq_window(vcpu); return; } exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); } static void vmx_inject_irq(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); uint32_t intr; int irq = vcpu->arch.interrupt.nr; trace_kvm_inj_virq(irq); ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { int inc_eip = 0; if (vcpu->arch.interrupt.soft) inc_eip = vcpu->arch.event_exit_inst_len; kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); return; } intr = irq | INTR_INFO_VALID_MASK; if (vcpu->arch.interrupt.soft) { intr |= INTR_TYPE_SOFT_INTR; vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmx->vcpu.arch.event_exit_inst_len); } else intr |= INTR_TYPE_EXT_INTR; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); vmx_clear_hlt(vcpu); } static void vmx_inject_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (!enable_vnmi) { /* * Tracking the NMI-blocked state in software is built upon * finding the next open IRQ window. This, in turn, depends on * well-behaving guests: They have to keep IRQs disabled at * least as long as the NMI handler runs. Otherwise we may * cause NMI nesting, maybe breaking the guest. But as this is * highly unlikely, we can live with the residual risk. */ vmx->loaded_vmcs->soft_vnmi_blocked = 1; vmx->loaded_vmcs->vnmi_blocked_time = 0; } ++vcpu->stat.nmi_injections; vmx->loaded_vmcs->nmi_known_unmasked = false; if (vmx->rmode.vm86_active) { kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); return; } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); vmx_clear_hlt(vcpu); } bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool masked; if (!enable_vnmi) return vmx->loaded_vmcs->soft_vnmi_blocked; if (vmx->loaded_vmcs->nmi_known_unmasked) return false; masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; vmx->loaded_vmcs->nmi_known_unmasked = !masked; return masked; } void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (!enable_vnmi) { if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { vmx->loaded_vmcs->soft_vnmi_blocked = masked; vmx->loaded_vmcs->vnmi_blocked_time = 0; } } else { vmx->loaded_vmcs->nmi_known_unmasked = !masked; if (masked) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); else vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); } } bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) return false; if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) return true; return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | GUEST_INTR_STATE_NMI)); } static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { if (to_vmx(vcpu)->nested.nested_run_pending) return -EBUSY; /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) return -EBUSY; return !vmx_nmi_blocked(vcpu); } bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu) { return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) return false; return __vmx_interrupt_blocked(vcpu); } static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { if (to_vmx(vcpu)->nested.nested_run_pending) return -EBUSY; /* * An IRQ must not be injected into L2 if it's supposed to VM-Exit, * e.g. if the IRQ arrived asynchronously after checking nested events. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) return -EBUSY; return !vmx_interrupt_blocked(vcpu); } static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { void __user *ret; if (enable_unrestricted_guest) return 0; mutex_lock(&kvm->slots_lock); ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, PAGE_SIZE * 3); mutex_unlock(&kvm->slots_lock); if (IS_ERR(ret)) return PTR_ERR(ret); to_kvm_vmx(kvm)->tss_addr = addr; return init_rmode_tss(kvm, ret); } static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) { to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; return 0; } static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) { switch (vec) { case BP_VECTOR: /* * Update instruction length as we may reinject the exception * from user space while in guest debugging mode. */ to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) return false; fallthrough; case DB_VECTOR: return !(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)); case DE_VECTOR: case OF_VECTOR: case BR_VECTOR: case UD_VECTOR: case DF_VECTOR: case SS_VECTOR: case GP_VECTOR: case MF_VECTOR: return true; } return false; } static int handle_rmode_exception(struct kvm_vcpu *vcpu, int vec, u32 err_code) { /* * Instruction with address size override prefix opcode 0x67 * Cause the #SS fault with 0 error code in VM86 mode. */ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { if (kvm_emulate_instruction(vcpu, 0)) { if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; return kvm_vcpu_halt(vcpu); } return 1; } return 0; } /* * Forward all other exceptions that are valid in real mode. * FIXME: Breaks guest debugging in real mode, needs to be fixed with * the required debugging infrastructure rework. */ kvm_queue_exception(vcpu, vec); return 1; } static int handle_machine_check(struct kvm_vcpu *vcpu) { /* handled by vmx_vcpu_run() */ return 1; } /* * If the host has split lock detection disabled, then #AC is * unconditionally injected into the guest, which is the pre split lock * detection behaviour. * * If the host has split lock detection enabled then #AC is * only injected into the guest when: * - Guest CPL == 3 (user mode) * - Guest has #AC detection enabled in CR0 * - Guest EFLAGS has AC bit set */ bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) { if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) return true; return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) && (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); } static int handle_exception_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_run *kvm_run = vcpu->run; u32 intr_info, ex_no, error_code; unsigned long cr2, dr6; u32 vect_info; vect_info = vmx->idt_vectoring_info; intr_info = vmx_get_intr_info(vcpu); if (is_machine_check(intr_info) || is_nmi(intr_info)) return 1; /* handled by handle_exception_nmi_irqoff() */ if (is_invalid_opcode(intr_info)) return handle_ud(vcpu); error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { WARN_ON_ONCE(!enable_vmware_backdoor); /* * VMware backdoor emulation on #GP interception only handles * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero * error code on #GP. */ if (error_code) { kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); return 1; } return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); } /* * The #PF with PFEC.RSVD = 1 indicates the guest is accessing * MMIO, it is better to report an internal error. * See the comments in vmx_handle_exit. */ if ((vect_info & VECTORING_INFO_VALID_MASK) && !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; vcpu->run->internal.ndata = 4; vcpu->run->internal.data[0] = vect_info; vcpu->run->internal.data[1] = intr_info; vcpu->run->internal.data[2] = error_code; vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; return 0; } if (is_page_fault(intr_info)) { cr2 = vmx_get_exit_qual(vcpu); if (enable_ept && !vcpu->arch.apf.host_apf_flags) { /* * EPT will cause page fault only if we need to * detect illegal GPAs. */ WARN_ON_ONCE(!allow_smaller_maxphyaddr); kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); return 1; } else return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); } ex_no = intr_info & INTR_INFO_VECTOR_MASK; if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) return handle_rmode_exception(vcpu, ex_no, error_code); switch (ex_no) { case DB_VECTOR: dr6 = vmx_get_exit_qual(vcpu); if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { /* * If the #DB was due to ICEBP, a.k.a. INT1, skip the * instruction. ICEBP generates a trap-like #DB, but * despite its interception control being tied to #DB, * is an instruction intercept, i.e. the VM-Exit occurs * on the ICEBP itself. Note, skipping ICEBP also * clears STI and MOVSS blocking. * * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS * if single-step is enabled in RFLAGS and STI or MOVSS * blocking is active, as the CPU doesn't set the bit * on VM-Exit due to #DB interception. VM-Entry has a * consistency check that a single-step #DB is pending * in this scenario as the previous instruction cannot * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV * don't modify RFLAGS), therefore the one instruction * delay when activating single-step breakpoints must * have already expired. Note, the CPU sets/clears BS * as appropriate for all other VM-Exits types. */ if (is_icebp(intr_info)) WARN_ON(!skip_emulated_instruction(vcpu)); else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); return 1; } kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); fallthrough; case BP_VECTOR: /* * Update instruction length as we may reinject #BP from * user space while in guest debugging mode. Reading it for * #DB as well causes no harm, it is not used in that case. */ vmx->vcpu.arch.event_exit_inst_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); kvm_run->debug.arch.exception = ex_no; break; case AC_VECTOR: if (vmx_guest_inject_ac(vcpu)) { kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); return 1; } /* * Handle split lock. Depending on detection mode this will * either warn and disable split lock detection for this * task or force SIGBUS on it. */ if (handle_guest_split_lock(kvm_rip_read(vcpu))) return 1; fallthrough; default: kvm_run->exit_reason = KVM_EXIT_EXCEPTION; kvm_run->ex.exception = ex_no; kvm_run->ex.error_code = error_code; break; } return 0; } static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) { ++vcpu->stat.irq_exits; return 1; } static int handle_triple_fault(struct kvm_vcpu *vcpu) { vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; vcpu->mmio_needed = 0; return 0; } static int handle_io(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; int size, in, string; unsigned port; exit_qualification = vmx_get_exit_qual(vcpu); string = (exit_qualification & 16) != 0; ++vcpu->stat.io_exits; if (string) return kvm_emulate_instruction(vcpu, 0); port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; in = (exit_qualification & 8) != 0; return kvm_fast_pio(vcpu, size, port, in); } static void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) { /* * Patch in the VMCALL instruction: */ hypercall[0] = 0x0f; hypercall[1] = 0x01; hypercall[2] = 0xc1; } /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) { if (is_guest_mode(vcpu)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned long orig_val = val; /* * We get here when L2 changed cr0 in a way that did not change * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), * but did change L0 shadowed bits. So we first calculate the * effective cr0 value that L1 would like to write into the * hardware. It consists of the L2-owned bits from the new * value combined with the L1-owned bits from L1's guest_cr0. */ val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); if (kvm_set_cr0(vcpu, val)) return 1; vmcs_writel(CR0_READ_SHADOW, orig_val); return 0; } else { return kvm_set_cr0(vcpu, val); } } static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) { if (is_guest_mode(vcpu)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned long orig_val = val; /* analogously to handle_set_cr0 */ val = (val & ~vmcs12->cr4_guest_host_mask) | (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); if (kvm_set_cr4(vcpu, val)) return 1; vmcs_writel(CR4_READ_SHADOW, orig_val); return 0; } else return kvm_set_cr4(vcpu, val); } static int handle_desc(struct kvm_vcpu *vcpu) { WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); return kvm_emulate_instruction(vcpu, 0); } static int handle_cr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification, val; int cr; int reg; int err; int ret; exit_qualification = vmx_get_exit_qual(vcpu); cr = exit_qualification & 15; reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ val = kvm_register_read(vcpu, reg); trace_kvm_cr_write(cr, val); switch (cr) { case 0: err = handle_set_cr0(vcpu, val); return kvm_complete_insn_gp(vcpu, err); case 3: WARN_ON_ONCE(enable_unrestricted_guest); err = kvm_set_cr3(vcpu, val); return kvm_complete_insn_gp(vcpu, err); case 4: err = handle_set_cr4(vcpu, val); return kvm_complete_insn_gp(vcpu, err); case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); ret = kvm_complete_insn_gp(vcpu, err); if (lapic_in_kernel(vcpu)) return ret; if (cr8_prev <= cr8) return ret; /* * TODO: we might be squashing a * KVM_GUESTDBG_SINGLESTEP-triggered * KVM_EXIT_DEBUG here. */ vcpu->run->exit_reason = KVM_EXIT_SET_TPR; return 0; } } break; case 2: /* clts */ KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); return -EIO; case 1: /*mov from cr*/ switch (cr) { case 3: WARN_ON_ONCE(enable_unrestricted_guest); val = kvm_read_cr3(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); return kvm_skip_emulated_instruction(vcpu); case 8: val = kvm_get_cr8(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); return kvm_skip_emulated_instruction(vcpu); } break; case 3: /* lmsw */ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); kvm_lmsw(vcpu, val); return kvm_skip_emulated_instruction(vcpu); default: break; } vcpu->run->exit_reason = 0; vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", (int)(exit_qualification >> 4) & 3, cr); return 0; } static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; int dr, dr7, reg; int err = 1; exit_qualification = vmx_get_exit_qual(vcpu); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; /* First, if DR does not exist, trigger UD */ if (!kvm_require_dr(vcpu, dr)) return 1; if (kvm_x86_ops.get_cpl(vcpu) > 0) goto out; dr7 = vmcs_readl(GUEST_DR7); if (dr7 & DR7_GD) { /* * As the vm-exit takes precedence over the debug trap, we * need to emulate the latter, either for the host or the * guest debugging itself. */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW; vcpu->run->debug.arch.dr7 = dr7; vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); vcpu->run->debug.arch.exception = DB_VECTOR; vcpu->run->exit_reason = KVM_EXIT_DEBUG; return 0; } else { kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD); return 1; } } if (vcpu->guest_debug == 0) { exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); /* * No more DR vmexits; force a reload of the debug registers * and reenter on this instruction. The next vmexit will * retrieve the full state of the debug registers. */ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; return 1; } reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { unsigned long val; kvm_get_dr(vcpu, dr, &val); kvm_register_write(vcpu, reg, val); err = 0; } else { err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); } out: return kvm_complete_insn_gp(vcpu, err); } static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { get_debugreg(vcpu->arch.db[0], 0); get_debugreg(vcpu->arch.db[1], 1); get_debugreg(vcpu->arch.db[2], 2); get_debugreg(vcpu->arch.db[3], 3); get_debugreg(vcpu->arch.dr6, 6); vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); /* * exc_debug expects dr6 to be cleared after it runs, avoid that it sees * a stale dr6 from the guest. */ set_debugreg(DR6_RESERVED, 6); } static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) { lockdep_assert_irqs_disabled(); set_debugreg(vcpu->arch.dr6, 6); } static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); } static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { kvm_apic_update_ppr(vcpu); return 1; } static int handle_interrupt_window(struct kvm_vcpu *vcpu) { exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); kvm_make_request(KVM_REQ_EVENT, vcpu); ++vcpu->stat.irq_window_exits; return 1; } static int handle_invlpg(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmx_get_exit_qual(vcpu); kvm_mmu_invlpg(vcpu, exit_qualification); return kvm_skip_emulated_instruction(vcpu); } static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int access_type, offset; access_type = exit_qualification & APIC_ACCESS_TYPE; offset = exit_qualification & APIC_ACCESS_OFFSET; /* * Sane guest uses MOV to write EOI, with written value * not cared. So make a short-circuit here by avoiding * heavy instruction emulation. */ if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && (offset == APIC_EOI)) { kvm_lapic_set_eoi(vcpu); return kvm_skip_emulated_instruction(vcpu); } } return kvm_emulate_instruction(vcpu, 0); } static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int vector = exit_qualification & 0xff; /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ kvm_apic_set_eoi_accelerated(vcpu, vector); return 1; } static int handle_apic_write(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmx_get_exit_qual(vcpu); u32 offset = exit_qualification & 0xfff; /* APIC-write VM exit is trap-like and thus no need to adjust IP */ kvm_apic_write_nodecode(vcpu, offset); return 1; } static int handle_task_switch(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification; bool has_error_code = false; u32 error_code = 0; u16 tss_selector; int reason, type, idt_v, idt_index; idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); exit_qualification = vmx_get_exit_qual(vcpu); reason = (u32)exit_qualification >> 30; if (reason == TASK_SWITCH_GATE && idt_v) { switch (type) { case INTR_TYPE_NMI_INTR: vcpu->arch.nmi_injected = false; vmx_set_nmi_mask(vcpu, true); break; case INTR_TYPE_EXT_INTR: case INTR_TYPE_SOFT_INTR: kvm_clear_interrupt_queue(vcpu); break; case INTR_TYPE_HARD_EXCEPTION: if (vmx->idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { has_error_code = true; error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE); } fallthrough; case INTR_TYPE_SOFT_EXCEPTION: kvm_clear_exception_queue(vcpu); break; default: break; } } tss_selector = exit_qualification; if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && type != INTR_TYPE_EXT_INTR && type != INTR_TYPE_NMI_INTR)) WARN_ON(!skip_emulated_instruction(vcpu)); /* * TODO: What about debug traps on tss switch? * Are we supposed to inject them and update dr6? */ return kvm_task_switch(vcpu, tss_selector, type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, has_error_code, error_code); } static int handle_ept_violation(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; gpa_t gpa; u64 error_code; exit_qualification = vmx_get_exit_qual(vcpu); /* * EPT violation happened while executing iret from NMI, * "blocked by NMI" bit has to be set before next VM entry. * There are errata that may cause this bit to not be set: * AAK134, BY25. */ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && enable_vnmi && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); /* Is it a read fault? */ error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) ? PFERR_USER_MASK : 0; /* Is it a write fault? */ error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) ? PFERR_WRITE_MASK : 0; /* Is it a fetch fault? */ error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) ? PFERR_FETCH_MASK : 0; /* ept page table entry is present? */ error_code |= (exit_qualification & (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | EPT_VIOLATION_EXECUTABLE)) ? PFERR_PRESENT_MASK : 0; error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ? PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; vcpu->arch.exit_qualification = exit_qualification; /* * Check that the GPA doesn't exceed physical memory limits, as that is * a guest page fault. We have to emulate the instruction here, because * if the illegal address is that of a paging structure, then * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we * would also use advanced VM-exit information for EPT violations to * reconstruct the page fault error code. */ if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa))) return kvm_emulate_instruction(vcpu, 0); return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { gpa_t gpa; if (!vmx_can_emulate_instruction(vcpu, NULL, 0)) return 1; /* * A nested guest cannot optimize MMIO vmexits, because we have an * nGPA here instead of the required GPA. */ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); if (!is_guest_mode(vcpu) && !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { trace_kvm_fast_mmio(gpa); return kvm_skip_emulated_instruction(vcpu); } return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); } static int handle_nmi_window(struct kvm_vcpu *vcpu) { if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) return -EIO; exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); ++vcpu->stat.nmi_window_exits; kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool intr_window_requested; unsigned count = 130; intr_window_requested = exec_controls_get(vmx) & CPU_BASED_INTR_WINDOW_EXITING; while (vmx->emulation_required && count-- != 0) { if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) return handle_interrupt_window(&vmx->vcpu); if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; if (!kvm_emulate_instruction(vcpu, 0)) return 0; if (vmx->emulation_required && !vmx->rmode.vm86_active && vcpu->arch.exception.pending) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; return 0; } if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; return kvm_vcpu_halt(vcpu); } /* * Note, return 1 and not 0, vcpu_run() will invoke * xfer_to_guest_mode() which will create a proper return * code. */ if (__xfer_to_guest_mode_work_pending()) return 1; } return 1; } static void grow_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned int old = vmx->ple_window; vmx->ple_window = __grow_ple_window(old, ple_window, ple_window_grow, ple_window_max); if (vmx->ple_window != old) { vmx->ple_window_dirty = true; trace_kvm_ple_window_update(vcpu->vcpu_id, vmx->ple_window, old); } } static void shrink_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned int old = vmx->ple_window; vmx->ple_window = __shrink_ple_window(old, ple_window, ple_window_shrink, ple_window); if (vmx->ple_window != old) { vmx->ple_window_dirty = true; trace_kvm_ple_window_update(vcpu->vcpu_id, vmx->ple_window, old); } } /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. */ static int handle_pause(struct kvm_vcpu *vcpu) { if (!kvm_pause_in_guest(vcpu->kvm)) grow_ple_window(vcpu); /* * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" * VM-execution control is ignored if CPL > 0. OTOH, KVM * never set PAUSE_EXITING and just set PLE if supported, * so the vcpu must be CPL=0 if it gets a PAUSE exit. */ kvm_vcpu_on_spin(vcpu, true); return kvm_skip_emulated_instruction(vcpu); } static int handle_monitor_trap(struct kvm_vcpu *vcpu) { return 1; } static int handle_invpcid(struct kvm_vcpu *vcpu) { u32 vmx_instruction_info; unsigned long type; gva_t gva; struct { u64 pcid; u64 gla; } operand; if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); if (type > 3) { kvm_inject_gp(vcpu, 0); return 1; } /* According to the Intel instruction reference, the memory operand * is read even if it isn't needed (e.g., for type==all) */ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; return kvm_handle_invpcid(vcpu, type, gva); } static int handle_pml_full(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; trace_kvm_pml_full(vcpu->vcpu_id); exit_qualification = vmx_get_exit_qual(vcpu); /* * PML buffer FULL happened while executing iret from NMI, * "blocked by NMI" bit has to be set before next VM entry. */ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && enable_vnmi && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); /* * PML buffer already flushed at beginning of VMEXIT. Nothing to do * here.., and there's no userspace involvement needed for PML. */ return 1; } static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (!vmx->req_immediate_exit && !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) { kvm_lapic_expired_hv_timer(vcpu); return EXIT_FASTPATH_REENTER_GUEST; } return EXIT_FASTPATH_NONE; } static int handle_preemption_timer(struct kvm_vcpu *vcpu) { handle_fastpath_preemption_timer(vcpu); return 1; } /* * When nested=0, all VMX instruction VM Exits filter here. The handlers * are overwritten by nested_vmx_setup() when nested=1. */ static int handle_vmx_instruction(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } #ifndef CONFIG_X86_SGX_KVM static int handle_encls(struct kvm_vcpu *vcpu) { /* * SGX virtualization is disabled. There is no software enable bit for * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent * the guest from executing ENCLS (when SGX is supported by hardware). */ kvm_queue_exception(vcpu, UD_VECTOR); return 1; } #endif /* CONFIG_X86_SGX_KVM */ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) { /* * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK * VM-Exits. Unconditionally set the flag here and leave the handling to * vmx_handle_exit(). */ to_vmx(vcpu)->exit_reason.bus_lock_detected = true; return 1; } /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs * to be done to userspace and return 0. */ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi, [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, [EXIT_REASON_IO_INSTRUCTION] = handle_io, [EXIT_REASON_CR_ACCESS] = handle_cr, [EXIT_REASON_DR_ACCESS] = handle_dr, [EXIT_REASON_CPUID] = kvm_emulate_cpuid, [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, [EXIT_REASON_HLT] = kvm_emulate_halt, [EXIT_REASON_INVD] = kvm_emulate_invd, [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc, [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, [EXIT_REASON_VMPTRST] = handle_vmx_instruction, [EXIT_REASON_VMREAD] = handle_vmx_instruction, [EXIT_REASON_VMRESUME] = handle_vmx_instruction, [EXIT_REASON_VMWRITE] = handle_vmx_instruction, [EXIT_REASON_VMOFF] = handle_vmx_instruction, [EXIT_REASON_VMON] = handle_vmx_instruction, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_APIC_WRITE] = handle_apic_write, [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd, [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, [EXIT_REASON_GDTR_IDTR] = handle_desc, [EXIT_REASON_LDTR_TR] = handle_desc, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait, [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor, [EXIT_REASON_INVEPT] = handle_vmx_instruction, [EXIT_REASON_INVVPID] = handle_vmx_instruction, [EXIT_REASON_RDRAND] = kvm_handle_invalid_op, [EXIT_REASON_RDSEED] = kvm_handle_invalid_op, [EXIT_REASON_PML_FULL] = handle_pml_full, [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmx_instruction, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, }; static const int kvm_vmx_max_exit_handlers = ARRAY_SIZE(kvm_vmx_exit_handlers); static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); *info1 = vmx_get_exit_qual(vcpu); if (!(vmx->exit_reason.failed_vmentry)) { *info2 = vmx->idt_vectoring_info; *intr_info = vmx_get_intr_info(vcpu); if (is_exception_with_error_code(*intr_info)) *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); else *error_code = 0; } else { *info2 = 0; *intr_info = 0; *error_code = 0; } } static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) { if (vmx->pml_pg) { __free_page(vmx->pml_pg); vmx->pml_pg = NULL; } } static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 *pml_buf; u16 pml_idx; pml_idx = vmcs_read16(GUEST_PML_INDEX); /* Do nothing if PML buffer is empty */ if (pml_idx == (PML_ENTITY_NUM - 1)) return; /* PML index always points to next available PML buffer entity */ if (pml_idx >= PML_ENTITY_NUM) pml_idx = 0; else pml_idx++; pml_buf = page_address(vmx->pml_pg); for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { u64 gpa; gpa = pml_buf[pml_idx]; WARN_ON(gpa & (PAGE_SIZE - 1)); kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); } /* reset PML index */ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } static void vmx_dump_sel(char *name, uint32_t sel) { pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", name, vmcs_read16(sel), vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); } static void vmx_dump_dtsel(char *name, uint32_t limit) { pr_err("%s limit=0x%08x, base=0x%016lx\n", name, vmcs_read32(limit), vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); } static void vmx_dump_msrs(char *name, struct vmx_msrs *m) { unsigned int i; struct vmx_msr_entry *e; pr_err("MSR %s:\n", name); for (i = 0, e = m->val; i < m->nr; ++i, ++e) pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value); } void dump_vmcs(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmentry_ctl, vmexit_ctl; u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; unsigned long cr4; int efer_slot; if (!dump_invalid_vmcs) { pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); return; } vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); cr4 = vmcs_readl(GUEST_CR4); secondary_exec_control = 0; if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); pr_err("*** Guest State ***\n"); pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), vmcs_readl(CR0_GUEST_HOST_MASK)); pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); if (cpu_has_vmx_ept()) { pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); } pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", vmcs_readl(GUEST_SYSENTER_ESP), vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER); if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER) pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER)); else if (efer_slot >= 0) pr_err("EFER= 0x%016llx (autoload)\n", vmx->msr_autoload.guest.val[efer_slot].value); else if (vmentry_ctl & VM_ENTRY_IA32E_MODE) pr_err("EFER= 0x%016llx (effective)\n", vcpu->arch.efer | (EFER_LMA | EFER_LME)); else pr_err("EFER= 0x%016llx (effective)\n", vcpu->arch.efer & ~(EFER_LMA | EFER_LME)); if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT) pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT)); pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", vmcs_read64(GUEST_IA32_DEBUGCTL), vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); if (cpu_has_load_perf_global_ctrl() && vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) pr_err("PerfGlobCtl = 0x%016llx\n", vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); pr_err("Interruptibility = %08x ActivityState = %08x\n", vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), vmcs_read32(GUEST_ACTIVITY_STATE)); if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) pr_err("InterruptStatus = %04x\n", vmcs_read16(GUEST_INTR_STATUS)); if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest); if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); pr_err("*** Host State ***\n"); pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), vmcs_read16(HOST_TR_SELECTOR)); pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), vmcs_readl(HOST_TR_BASE)); pr_err("GDTBase=%016lx IDTBase=%016lx\n", vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), vmcs_readl(HOST_CR4)); pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", vmcs_readl(HOST_IA32_SYSENTER_ESP), vmcs_read32(HOST_IA32_SYSENTER_CS), vmcs_readl(HOST_IA32_SYSENTER_EIP)); if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER) pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER)); if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT) pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT)); if (cpu_has_load_perf_global_ctrl() && vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) pr_err("PerfGlobCtl = 0x%016llx\n", vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); pr_err("*** Control State ***\n"); pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", vmcs_read32(EXCEPTION_BITMAP), vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", vmcs_read32(VM_EXIT_INTR_INFO), vmcs_read32(VM_EXIT_INTR_ERROR_CODE), vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); pr_err(" reason=%08x qualification=%016lx\n", vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); pr_err("IDTVectoring: info=%08x errcode=%08x\n", vmcs_read32(IDT_VECTORING_INFO_FIELD), vmcs_read32(IDT_VECTORING_ERROR_CODE)); pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) pr_err("TSC Multiplier = 0x%016llx\n", vmcs_read64(TSC_MULTIPLIER)); if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { u16 status = vmcs_read16(GUEST_INTR_STATUS); pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); } pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); } if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) pr_err("PLE Gap=%08x Window=%08x\n", vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) pr_err("Virtual processor ID = 0x%04x\n", vmcs_read16(VIRTUAL_PROCESSOR_ID)); } /* * The guest has exited. See if we can fix it or if we need userspace * assistance. */ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); union vmx_exit_reason exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; u16 exit_handler_index; /* * Flush logged GPAs PML buffer, this will make dirty_bitmap more * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before * querying dirty_bitmap, we only need to kick all vcpus out of guest * mode as if vcpus is in root mode, the PML buffer must has been * flushed already. Note, PML is never enabled in hardware while * running L2. */ if (enable_pml && !is_guest_mode(vcpu)) vmx_flush_pml_buffer(vcpu); /* * KVM should never reach this point with a pending nested VM-Enter. * More specifically, short-circuiting VM-Entry to emulate L2 due to * invalid guest state should never happen as that means KVM knowingly * allowed a nested VM-Enter with an invalid vmcs12. More below. */ if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) return -EIO; if (is_guest_mode(vcpu)) { /* * PML is never enabled when running L2, bail immediately if a * PML full exit occurs as something is horribly wrong. */ if (exit_reason.basic == EXIT_REASON_PML_FULL) goto unexpected_vmexit; /* * The host physical addresses of some pages of guest memory * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC * Page). The CPU may write to these pages via their host * physical address while L2 is running, bypassing any * address-translation-based dirty tracking (e.g. EPT write * protection). * * Mark them dirty on every exit from L2 to prevent them from * getting out of sync with dirty tracking. */ nested_mark_vmcs12_pages_dirty(vcpu); /* * Synthesize a triple fault if L2 state is invalid. In normal * operation, nested VM-Enter rejects any attempt to enter L2 * with invalid state. However, those checks are skipped if * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If * L2 state is invalid, it means either L1 modified SMRAM state * or userspace provided bad state. Synthesize TRIPLE_FAULT as * doing so is architecturally allowed in the RSM case, and is * the least awful solution for the userspace case without * risking false positives. */ if (vmx->emulation_required) { nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); return 1; } if (nested_vmx_reflect_vmexit(vcpu)) return 1; } /* If guest state is invalid, start emulating. L2 is handled above. */ if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); if (exit_reason.failed_vmentry) { dump_vmcs(vcpu); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full; vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; return 0; } if (unlikely(vmx->fail)) { dump_vmcs(vcpu); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; return 0; } /* * Note: * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by * delivery event since it indicates guest is accessing MMIO. * The vm-exit can be triggered again after return to guest that * will cause infinite loop. */ if ((vectoring_info & VECTORING_INFO_VALID_MASK) && (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI && exit_reason.basic != EXIT_REASON_EPT_VIOLATION && exit_reason.basic != EXIT_REASON_PML_FULL && exit_reason.basic != EXIT_REASON_APIC_ACCESS && exit_reason.basic != EXIT_REASON_TASK_SWITCH)) { int ndata = 3; vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; vcpu->run->internal.data[0] = vectoring_info; vcpu->run->internal.data[1] = exit_reason.full; vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) { vcpu->run->internal.data[ndata++] = vmcs_read64(GUEST_PHYSICAL_ADDRESS); } vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu; vcpu->run->internal.ndata = ndata; return 0; } if (unlikely(!enable_vnmi && vmx->loaded_vmcs->soft_vnmi_blocked)) { if (!vmx_interrupt_blocked(vcpu)) { vmx->loaded_vmcs->soft_vnmi_blocked = 0; } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && vcpu->arch.nmi_pending) { /* * This CPU don't support us in finding the end of an * NMI-blocked window if the guest runs with IRQs * disabled. So we pull the trigger after 1 s of * futile waiting, but inform the user about this. */ printk(KERN_WARNING "%s: Breaking out of NMI-blocked " "state on VCPU %d after 1 s timeout\n", __func__, vcpu->vcpu_id); vmx->loaded_vmcs->soft_vnmi_blocked = 0; } } if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; if (exit_reason.basic >= kvm_vmx_max_exit_handlers) goto unexpected_vmexit; #ifdef CONFIG_RETPOLINE if (exit_reason.basic == EXIT_REASON_MSR_WRITE) return kvm_emulate_wrmsr(vcpu); else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) return handle_preemption_timer(vcpu); else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW) return handle_interrupt_window(vcpu); else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) return handle_external_interrupt(vcpu); else if (exit_reason.basic == EXIT_REASON_HLT) return kvm_emulate_halt(vcpu); else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) return handle_ept_misconfig(vcpu); #endif exit_handler_index = array_index_nospec((u16)exit_reason.basic, kvm_vmx_max_exit_handlers); if (!kvm_vmx_exit_handlers[exit_handler_index]) goto unexpected_vmexit; return kvm_vmx_exit_handlers[exit_handler_index](vcpu); unexpected_vmexit: vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason.full); dump_vmcs(vcpu); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_reason.full; vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; return 0; } static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { int ret = __vmx_handle_exit(vcpu, exit_fastpath); /* * Exit to user space when bus lock detected to inform that there is * a bus lock in guest. */ if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { if (ret > 0) vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; return 0; } return ret; } /* * Software based L1D cache flush which is used when microcode providing * the cache control MSR is not loaded. * * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to * flush it is required to read in 64 KiB because the replacement algorithm * is not exactly LRU. This could be sized at runtime via topology * information but as all relevant affected CPUs have 32KiB L1D cache size * there is no point in doing so. */ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) { int size = PAGE_SIZE << L1D_CACHE_ORDER; /* * This code is only executed when the the flush mode is 'cond' or * 'always' */ if (static_branch_likely(&vmx_l1d_flush_cond)) { bool flush_l1d; /* * Clear the per-vcpu flush bit, it gets set again * either from vcpu_run() or from one of the unsafe * VMEXIT handlers. */ flush_l1d = vcpu->arch.l1tf_flush_l1d; vcpu->arch.l1tf_flush_l1d = false; /* * Clear the per-cpu flush bit, it gets set again from * the interrupt handlers. */ flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); kvm_clear_cpu_l1tf_flush_l1d(); if (!flush_l1d) return; } vcpu->stat.l1d_flush++; if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); return; } asm volatile( /* First ensure the pages are in the TLB */ "xorl %%eax, %%eax\n" ".Lpopulate_tlb:\n\t" "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" "addl $4096, %%eax\n\t" "cmpl %%eax, %[size]\n\t" "jne .Lpopulate_tlb\n\t" "xorl %%eax, %%eax\n\t" "cpuid\n\t" /* Now fill the cache */ "xorl %%eax, %%eax\n" ".Lfill_cache:\n" "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" "addl $64, %%eax\n\t" "cmpl %%eax, %[size]\n\t" "jne .Lfill_cache\n\t" "lfence\n" :: [flush_pages] "r" (vmx_l1d_flush_pages), [size] "r" (size) : "eax", "ebx", "ecx", "edx"); } static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); int tpr_threshold; if (is_guest_mode(vcpu) && nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) return; tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr; if (is_guest_mode(vcpu)) to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold; else vmcs_write32(TPR_THRESHOLD, tpr_threshold); } void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 sec_exec_control; if (!lapic_in_kernel(vcpu)) return; if (!flexpriority_enabled && !cpu_has_vmx_virtualize_x2apic_mode()) return; /* Postpone execution until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { vmx->nested.change_vmcs01_virtual_apic_mode = true; return; } sec_exec_control = secondary_exec_controls_get(vmx); sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); switch (kvm_get_apic_mode(vcpu)) { case LAPIC_MODE_INVALID: WARN_ONCE(true, "Invalid local APIC state"); break; case LAPIC_MODE_DISABLED: break; case LAPIC_MODE_XAPIC: if (flexpriority_enabled) { sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); /* * Flush the TLB, reloading the APIC access page will * only do so if its physical address has changed, but * the guest may have inserted a non-APIC mapping into * the TLB while the APIC access page was disabled. */ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } break; case LAPIC_MODE_X2APIC: if (cpu_has_vmx_virtualize_x2apic_mode()) sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; break; } secondary_exec_controls_set(vmx, sec_exec_control); vmx_update_msr_bitmap_x2apic(vcpu); } static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) { struct page *page; /* Defer reload until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true; return; } if (!(secondary_exec_controls_get(to_vmx(vcpu)) & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) return; page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (is_error_page(page)) return; vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page)); vmx_flush_tlb_current(vcpu); /* * Do not pin apic access page in memory, the MMU notifier * will call us again if it is migrated or swapped out. */ put_page(page); } static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) { u16 status; u8 old; if (max_isr == -1) max_isr = 0; status = vmcs_read16(GUEST_INTR_STATUS); old = status >> 8; if (max_isr != old) { status &= 0xff; status |= max_isr << 8; vmcs_write16(GUEST_INTR_STATUS, status); } } static void vmx_set_rvi(int vector) { u16 status; u8 old; if (vector == -1) vector = 0; status = vmcs_read16(GUEST_INTR_STATUS); old = (u8)status & 0xff; if ((u8)vector != old) { status &= ~0xff; status |= (u8)vector; vmcs_write16(GUEST_INTR_STATUS, status); } } static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) { /* * When running L2, updating RVI is only relevant when * vmcs12 virtual-interrupt-delivery enabled. * However, it can be enabled only when L1 also * intercepts external-interrupts and in that case * we should not update vmcs02 RVI but instead intercept * interrupt. Therefore, do nothing when running L2. */ if (!is_guest_mode(vcpu)) vmx_set_rvi(max_irr); } static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; bool got_posted_interrupt; if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) return -EIO; if (pi_test_on(&vmx->pi_desc)) { pi_clear_on(&vmx->pi_desc); /* * IOMMU can write to PID.ON, so the barrier matters even on UP. * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); got_posted_interrupt = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); got_posted_interrupt = false; } /* * Newly recognized interrupts are injected via either virtual interrupt * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is * disabled in two cases: * * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected * into L2, but KVM doesn't use virtual interrupt delivery to inject * interrupts into L2, and so KVM_REQ_EVENT is again needed. * * 2) If APICv is disabled for this vCPU, assigned devices may still * attempt to post interrupts. The posted interrupt vector will cause * a VM-Exit and the subsequent entry will call sync_pir_to_irr. */ if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) vmx_set_rvi(max_irr); else if (got_posted_interrupt) kvm_make_request(KVM_REQ_EVENT, vcpu); return max_irr; } static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) return; vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); pi_clear_on(&vmx->pi_desc); memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); } void vmx_do_interrupt_nmi_irqoff(unsigned long entry); static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, unsigned long entry) { kvm_before_interrupt(vcpu); vmx_do_interrupt_nmi_irqoff(entry); kvm_after_interrupt(vcpu); } static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist; u32 intr_info = vmx_get_intr_info(&vmx->vcpu); /* if exit due to PF check for async PF */ if (is_page_fault(intr_info)) vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); /* Handle machine checks before interrupts are enabled */ else if (is_machine_check(intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ else if (is_nmi(intr_info)) handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry); } static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) { u32 intr_info = vmx_get_intr_info(vcpu); unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; gate_desc *desc = (gate_desc *)host_idt_base + vector; if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc)); vcpu->arch.at_instruction_boundary = true; } static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (vmx->emulation_required) return; if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu); else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) handle_exception_nmi_irqoff(vmx); } /* * The kvm parameter can be NULL (module initialization, or invocation before * VM creation). Be sure to check the kvm parameter before using it. */ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) { switch (index) { case MSR_IA32_SMBASE: /* * We cannot do SMM unless we can run the guest in big * real mode. */ return enable_unrestricted_guest || emulate_invalid_guest_state; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: return nested; case MSR_AMD64_VIRT_SPEC_CTRL: /* This is AMD only. */ return false; default: return true; } } static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; bool unblock_nmi; u8 vector; bool idtv_info_valid; idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; if (enable_vnmi) { if (vmx->loaded_vmcs->nmi_known_unmasked) return; exit_intr_info = vmx_get_intr_info(&vmx->vcpu); unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; vector = exit_intr_info & INTR_INFO_VECTOR_MASK; /* * SDM 3: 27.7.1.2 (September 2008) * Re-set bit "block by NMI" before VM entry if vmexit caused by * a guest IRET fault. * SDM 3: 23.2.2 (September 2008) * Bit 12 is undefined in any of the following cases: * If the VM exit sets the valid bit in the IDT-vectoring * information field. * If the VM exit is due to a double fault. */ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && vector != DF_VECTOR && !idtv_info_valid) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); else vmx->loaded_vmcs->nmi_known_unmasked = !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI); } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) vmx->loaded_vmcs->vnmi_blocked_time += ktime_to_ns(ktime_sub(ktime_get(), vmx->loaded_vmcs->entry_time)); } static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, u32 idt_vectoring_info, int instr_len_field, int error_code_field) { u8 vector; int type; bool idtv_info_valid; idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; vcpu->arch.nmi_injected = false; kvm_clear_exception_queue(vcpu); kvm_clear_interrupt_queue(vcpu); if (!idtv_info_valid) return; kvm_make_request(KVM_REQ_EVENT, vcpu); vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; switch (type) { case INTR_TYPE_NMI_INTR: vcpu->arch.nmi_injected = true; /* * SDM 3: 27.7.1.2 (September 2008) * Clear bit "block by NMI" before VM entry if a NMI * delivery faulted. */ vmx_set_nmi_mask(vcpu, false); break; case INTR_TYPE_SOFT_EXCEPTION: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); fallthrough; case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { u32 err = vmcs_read32(error_code_field); kvm_requeue_exception_e(vcpu, vector, err); } else kvm_requeue_exception(vcpu, vector); break; case INTR_TYPE_SOFT_INTR: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); fallthrough; case INTR_TYPE_EXT_INTR: kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); break; default: break; } } static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, VM_EXIT_INSTRUCTION_LEN, IDT_VECTORING_ERROR_CODE); } static void vmx_cancel_injection(struct kvm_vcpu *vcpu) { __vmx_complete_interrupts(vcpu, vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), VM_ENTRY_INSTRUCTION_LEN, VM_ENTRY_EXCEPTION_ERROR_CODE); vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) { int i, nr_msrs; struct perf_guest_switch_msr *msrs; /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ msrs = perf_guest_get_msrs(&nr_msrs); if (!msrs) return; for (i = 0; i < nr_msrs; i++) if (msrs[i].host == msrs[i].guest) clear_atomic_switch_msr(vmx, msrs[i].msr); else add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, msrs[i].host, false); } static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tscl; u32 delta_tsc; if (vmx->req_immediate_exit) { vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); vmx->loaded_vmcs->hv_timer_soft_disabled = false; } else if (vmx->hv_deadline_tsc != -1) { tscl = rdtsc(); if (vmx->hv_deadline_tsc > tscl) /* set_hv_timer ensures the delta fits in 32-bits */ delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> cpu_preemption_timer_multi); else delta_tsc = 0; vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); vmx->loaded_vmcs->hv_timer_soft_disabled = false; } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) { vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1); vmx->loaded_vmcs->hv_timer_soft_disabled = true; } } void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) { if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { vmx->loaded_vmcs->host_state.rsp = host_rsp; vmcs_writel(HOST_RSP, host_rsp); } } void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags) { u64 hostval = this_cpu_read(x86_spec_ctrl_current); if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) return; if (flags & VMX_RUN_SAVE_SPEC_CTRL) vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL); /* * If the guest/host SPEC_CTRL values differ, restore the host value. * * For legacy IBRS, the IBRS bit always needs to be written after * transitioning from a less privileged predictor mode, regardless of * whether the guest/host values differ. */ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || vmx->spec_ctrl != hostval) native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); barrier_nospec(); } static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { switch (to_vmx(vcpu)->exit_reason.basic) { case EXIT_REASON_MSR_WRITE: return handle_fastpath_set_msr_irqoff(vcpu); case EXIT_REASON_PREEMPTION_TIMER: return handle_fastpath_preemption_timer(vcpu); default: return EXIT_FASTPATH_NONE; } } static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx, unsigned long flags) { kvm_guest_enter_irqoff(); /* * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW * mitigation for MDS is done late in VMentry and is still * executed in spite of L1D Flush. This is because an extra VERW * should not matter much after the big hammer L1D Flush. */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); else if (static_branch_unlikely(&mmio_stale_data_clear) && kvm_arch_has_assigned_device(vcpu->kvm)) mds_clear_cpu_buffers(); vmx_disable_fb_clear(vmx); if (vcpu->arch.cr2 != native_read_cr2()) native_write_cr2(vcpu->arch.cr2); vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, flags); vcpu->arch.cr2 = native_read_cr2(); vmx_enable_fb_clear(vmx); kvm_guest_exit_irqoff(); } static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && vmx->loaded_vmcs->soft_vnmi_blocked)) vmx->loaded_vmcs->entry_time = ktime_get(); /* * Don't enter VMX if guest state is invalid, let the exit handler * start emulation until we arrive back to a valid state. Synthesize a * consistency check VM-Exit due to invalid guest state and bail. */ if (unlikely(vmx->emulation_required)) { vmx->fail = 0; vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; vmx->exit_reason.failed_vmentry = 1; kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); vmx->exit_qualification = ENTRY_FAIL_DEFAULT; kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); vmx->exit_intr_info = 0; return EXIT_FASTPATH_NONE; } trace_kvm_entry(vcpu); if (vmx->ple_window_dirty) { vmx->ple_window_dirty = false; vmcs_write32(PLE_WINDOW, vmx->ple_window); } /* * We did this in prepare_switch_to_guest, because it needs to * be within srcu_read_lock. */ WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync); if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); cr3 = __get_current_cr3_fast(); if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { vmcs_writel(HOST_CR3, cr3); vmx->loaded_vmcs->host_state.cr3 = cr3; } cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); vmx->loaded_vmcs->host_state.cr4 = cr4; } /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug * exceptions being set, but that's not correct for the guest debugging * case. */ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); kvm_load_guest_xsave_state(vcpu); pt_guest_enter(vmx); atomic_switch_perf_msrs(vmx); if (intel_pmu_lbr_is_enabled(vcpu)) vmx_passthrough_lbr_msrs(vcpu); if (enable_preemption_timer) vmx_update_hv_timer(vcpu); kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if * it's non-zero. Since vmentry is serialising on affected CPUs, there * is no need to worry about the conditional branch over the wrmsr * being speculatively taken. */ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); /* The actual VMENTER/EXIT is in the .noinstr.text section. */ vmx_vcpu_enter_exit(vcpu, vmx, __vmx_vcpu_run_flags(vmx)); /* All fields are clean at this point */ if (static_branch_unlikely(&enable_evmcs)) { current_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu); } /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ if (vmx->host_debugctlmsr) update_debugctlmsr(vmx->host_debugctlmsr); #ifndef CONFIG_X86_64 /* * The sysexit path does not restore ds/es, so we must set them to * a reasonable value ourselves. * * We can't defer this to vmx_prepare_switch_to_host() since that * function may be executed in interrupt context, which saves and * restore segments around it, nullifying its effect. */ loadsegment(ds, __USER_DS); loadsegment(es, __USER_DS); #endif vmx_register_cache_reset(vcpu); pt_guest_exit(vmx); kvm_load_host_xsave_state(vcpu); if (is_guest_mode(vcpu)) { /* * Track VMLAUNCH/VMRESUME that have made past guest state * checking. */ if (vmx->nested.nested_run_pending && !vmx->exit_reason.failed_vmentry) ++vcpu->stat.nested_run; vmx->nested.nested_run_pending = 0; } vmx->idt_vectoring_info = 0; if (unlikely(vmx->fail)) { vmx->exit_reason.full = 0xdead; return EXIT_FASTPATH_NONE; } vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); if (likely(!vmx->exit_reason.failed_vmentry)) vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX); if (unlikely(vmx->exit_reason.failed_vmentry)) return EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); if (is_guest_mode(vcpu)) return EXIT_FASTPATH_NONE; return vmx_exit_handlers_fastpath(vcpu); } static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (enable_pml) vmx_destroy_pml_buffer(vmx); free_vpid(vmx->vpid); nested_vmx_free_vcpu(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); } static int vmx_create_vcpu(struct kvm_vcpu *vcpu) { struct vmx_uret_msr *tsx_ctrl; struct vcpu_vmx *vmx; int i, cpu, err; BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); err = -ENOMEM; vmx->vpid = allocate_vpid(); /* * If PML is turned on, failure on enabling PML just results in failure * of creating the vcpu, therefore we can simplify PML logic (by * avoiding dealing with cases, such as enabling PML partially on vcpus * for the guest), etc. */ if (enable_pml) { vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!vmx->pml_pg) goto free_vpid; } for (i = 0; i < kvm_nr_uret_msrs; ++i) { vmx->guest_uret_msrs[i].data = 0; vmx->guest_uret_msrs[i].mask = -1ull; } if (boot_cpu_has(X86_FEATURE_RTM)) { /* * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. * Keep the host value unchanged to avoid changing CPUID bits * under the host kernel's feet. */ tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (tsx_ctrl) tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR; } err = alloc_loaded_vmcs(&vmx->vmcs01); if (err < 0) goto free_pml; /* * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the * feature only for vmcs01, KVM currently isn't equipped to realize any * performance benefits from enabling it for vmcs02. */ if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) && (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; evmcs->hv_enlightenments_control.msr_bitmap = 1; } /* The MSR bitmap starts with all ones */ bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); #ifdef CONFIG_X86_64 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); #endif vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); if (kvm_cstate_in_guest(vcpu->kvm)) { vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); } vmx->loaded_vmcs = &vmx->vmcs01; cpu = get_cpu(); vmx_vcpu_load(vcpu, cpu); vcpu->cpu = cpu; init_vmcs(vmx); vmx_vcpu_put(vcpu); put_cpu(); if (cpu_need_virtualize_apic_accesses(vcpu)) { err = alloc_apic_access_page(vcpu->kvm); if (err) goto free_vmcs; } if (enable_ept && !enable_unrestricted_guest) { err = init_rmode_identity_map(vcpu->kvm); if (err) goto free_vmcs; } if (nested) memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); else memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); vcpu_setup_sgx_lepubkeyhash(vcpu); vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; vcpu->arch.microcode_version = 0x100000000ULL; vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; /* * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR * or POSTED_INTR_WAKEUP_VECTOR. */ vmx->pi_desc.nv = POSTED_INTR_VECTOR; vmx->pi_desc.sn = 1; return 0; free_vmcs: free_loaded_vmcs(vmx->loaded_vmcs); free_pml: vmx_destroy_pml_buffer(vmx); free_vpid: free_vpid(vmx->vpid); return err; } #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" static int vmx_vm_init(struct kvm *kvm) { if (!ple_gap) kvm->arch.pause_in_guest = true; if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { switch (l1tf_mitigation) { case L1TF_MITIGATION_OFF: case L1TF_MITIGATION_FLUSH_NOWARN: /* 'I explicitly don't care' is set */ break; case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT: case L1TF_MITIGATION_FULL: /* * Warn upon starting the first VM in a potentially * insecure environment. */ if (sched_smt_active()) pr_warn_once(L1TF_MSG_SMT); if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) pr_warn_once(L1TF_MSG_L1D); break; case L1TF_MITIGATION_FULL_FORCE: /* Flush is enforced */ break; } } return 0; } static int __init vmx_check_processor_compat(void) { struct vmcs_config vmcs_conf; struct vmx_capability vmx_cap; if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || !this_cpu_has(X86_FEATURE_VMX)) { pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id()); return -EIO; } if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) return -EIO; if (nested) nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept); if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", smp_processor_id()); return -EIO; } return 0; } static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u8 cache; u64 ipat = 0; /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in * memory aliases with conflicting memory types and sometimes MCEs. * We have to be careful as to what are honored and when. * * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to * UC. The effective memory type is UC or WC depending on guest PAT. * This was historically the source of MCEs and we want to be * conservative. * * When there is no need to deal with noncoherent DMA (e.g., no VT-d * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The * EPT memory type is set to WB. The effective memory type is forced * WB. * * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The * EPT memory type is used to emulate guest CD/MTRR. */ if (is_mmio) { cache = MTRR_TYPE_UNCACHABLE; goto exit; } if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { ipat = VMX_EPT_IPAT_BIT; cache = MTRR_TYPE_WRBACK; goto exit; } if (kvm_read_cr0(vcpu) & X86_CR0_CD) { ipat = VMX_EPT_IPAT_BIT; if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) cache = MTRR_TYPE_WRBACK; else cache = MTRR_TYPE_UNCACHABLE; goto exit; } cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); exit: return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; } static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) { /* * These bits in the secondary execution controls field * are dynamic, the others are mostly based on the hypervisor * architecture and the guest's CPUID. Do not touch the * dynamic bits. */ u32 mask = SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_DESC; u32 cur_ctl = secondary_exec_controls_get(vmx); secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); } /* * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits * (indicating "allowed-1") if they are supported in the guest's CPUID. */ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_cpuid_entry2 *entry; vmx->nested.msrs.cr0_fixed1 = 0xffffffff; vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ if (entry && (entry->_reg & (_cpuid_mask))) \ vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ } while (0) entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE)); cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE)); cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE)); cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE)); cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE)); cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR)); cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM)); cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX)); cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX)); cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); #undef cr4_fixed1_update } static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (kvm_mpx_supported()) { bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); if (mpx_enabled) { vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; } else { vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; } } } static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_cpuid_entry2 *best = NULL; int i; for (i = 0; i < PT_CPUID_LEAVES; i++) { best = kvm_find_cpuid_entry(vcpu, 0x14, i); if (!best) return; vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; } /* Get the number of configurable Address Ranges for filtering */ vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_num_address_ranges); /* Initialize and clear the no dependency bits */ vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC); /* * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise * will inject an #GP */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; /* * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and * PSBFreq can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc)) vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); /* * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and * MTCFreq can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE); /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | RTIT_CTL_PTW_EN); /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace)) vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; /* unmask address range configure area */ for (i = 0; i < vmx->pt_desc.addr_range; i++) vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); } static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */ vcpu->arch.xsaves_enabled = false; vmx_setup_uret_msrs(vmx); if (cpu_has_secondary_exec_ctrls()) vmcs_set_secondary_exec_control(vmx, vmx_secondary_exec_control(vmx)); if (nested_vmx_allowed(vcpu)) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; else to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); if (nested_vmx_allowed(vcpu)) { nested_vmx_cr_fixed1_bits_update(vcpu); nested_vmx_entry_exit_ctls_update(vcpu); } if (boot_cpu_has(X86_FEATURE_INTEL_PT) && guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) update_intel_pt_cfg(vcpu); if (boot_cpu_has(X86_FEATURE_RTM)) { struct vmx_uret_msr *msr; msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (msr) { bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); } } set_cr4_guest_host_mask(vmx); vmx_write_encls_bitmap(vcpu, NULL); if (guest_cpuid_has(vcpu, X86_FEATURE_SGX)) vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; else vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_LC_ENABLED; else vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_LC_ENABLED; /* Refresh #PF interception to account for MAXPHYADDR changes. */ vmx_update_exception_bitmap(vcpu); } static __init void vmx_set_cpu_caps(void) { kvm_set_cpu_caps(); /* CPUID 0x1 */ if (nested) kvm_cpu_cap_set(X86_FEATURE_VMX); /* CPUID 0x7 */ if (kvm_mpx_supported()) kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); if (!cpu_has_vmx_invpcid()) kvm_cpu_cap_clear(X86_FEATURE_INVPCID); if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); if (!enable_sgx) { kvm_cpu_cap_clear(X86_FEATURE_SGX); kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); kvm_cpu_cap_clear(X86_FEATURE_SGX1); kvm_cpu_cap_clear(X86_FEATURE_SGX2); } if (vmx_umip_emulated()) kvm_cpu_cap_set(X86_FEATURE_UMIP); /* CPUID 0xD.1 */ supported_xss = 0; if (!cpu_has_vmx_xsaves()) kvm_cpu_cap_clear(X86_FEATURE_XSAVES); /* CPUID 0x80000001 and 0x7 (RDPID) */ if (!cpu_has_vmx_rdtscp()) { kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); kvm_cpu_cap_clear(X86_FEATURE_RDPID); } if (cpu_has_vmx_waitpkg()) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); } static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) { to_vmx(vcpu)->req_immediate_exit = true; } static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, struct x86_instruction_info *info) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned short port; bool intercept; int size; if (info->intercept == x86_intercept_in || info->intercept == x86_intercept_ins) { port = info->src_val; size = info->dst_bytes; } else { port = info->dst_val; size = info->src_bytes; } /* * If the 'use IO bitmaps' VM-execution control is 0, IO instruction * VM-exits depend on the 'unconditional IO exiting' VM-execution * control. * * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. */ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) intercept = nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); else intercept = nested_vmx_check_io_bitmaps(vcpu, port, size); /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; } static int vmx_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage, struct x86_exception *exception) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); switch (info->intercept) { /* * RDPID causes #UD if disabled through secondary execution controls. * Because it is marked as EmulateOnUD, we need to intercept it here. * Note, RDPID is hidden behind ENABLE_RDTSCP. */ case x86_intercept_rdpid: if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { exception->vector = UD_VECTOR; exception->error_code_valid = false; return X86EMUL_PROPAGATE_FAULT; } break; case x86_intercept_in: case x86_intercept_ins: case x86_intercept_out: case x86_intercept_outs: return vmx_check_intercept_io(vcpu, info); case x86_intercept_lgdt: case x86_intercept_lidt: case x86_intercept_lldt: case x86_intercept_ltr: case x86_intercept_sgdt: case x86_intercept_sidt: case x86_intercept_sldt: case x86_intercept_str: if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) return X86EMUL_CONTINUE; /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ break; case x86_intercept_pause: /* * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides * with vanilla NOPs in the emulator. Apply the interception * check only to actual PAUSE instructions. Don't check * PAUSE-loop-exiting, software can't expect a given PAUSE to * exit, i.e. KVM is within its rights to allow L2 to execute * the PAUSE. */ if ((info->rep_prefix != REPE_PREFIX) || !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING)) return X86EMUL_CONTINUE; break; /* TODO: check more intercepts... */ default: break; } return X86EMUL_UNHANDLEABLE; } #ifdef CONFIG_X86_64 /* (a << shift) / divisor, return 1 if overflow otherwise 0 */ static inline int u64_shl_div_u64(u64 a, unsigned int shift, u64 divisor, u64 *result) { u64 low = a << shift, high = a >> (64 - shift); /* To avoid the overflow on divq */ if (high >= divisor) return 1; /* Low hold the result, high hold rem which is discarded */ asm("divq %2\n\t" : "=a" (low), "=d" (high) : "rm" (divisor), "0" (low), "1" (high)); *result = low; return 0; } static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, bool *expired) { struct vcpu_vmx *vmx; u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; vmx = to_vmx(vcpu); tscl = rdtsc(); guest_tscl = kvm_read_l1_tsc(vcpu, tscl); delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; lapic_timer_advance_cycles = nsec_to_cycles(vcpu, ktimer->timer_advance_ns); if (delta_tsc > lapic_timer_advance_cycles) delta_tsc -= lapic_timer_advance_cycles; else delta_tsc = 0; /* Convert to host delta tsc if tsc scaling is enabled */ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && delta_tsc && u64_shl_div_u64(delta_tsc, kvm_tsc_scaling_ratio_frac_bits, vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) return -ERANGE; /* * If the delta tsc can't fit in the 32 bit after the multi shift, * we can't use the preemption timer. * It's possible that it fits on later vmentries, but checking * on every vmentry is costly so we just use an hrtimer. */ if (delta_tsc >> (cpu_preemption_timer_multi + 32)) return -ERANGE; vmx->hv_deadline_tsc = tscl + delta_tsc; *expired = !delta_tsc; return 0; } static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) { to_vmx(vcpu)->hv_deadline_tsc = -1; } #endif static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) { if (!kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); } void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); if (is_guest_mode(vcpu)) { vmx->nested.update_vmcs01_cpu_dirty_logging = true; return; } /* * Note, cpu_dirty_logging_count can be changed concurrent with this * code, but in that case another update request will be made and so * the guest will never run with a stale PML value. */ if (vcpu->kvm->arch.cpu_dirty_logging_count) secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); else secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); } static int vmx_pre_block(struct kvm_vcpu *vcpu) { if (pi_pre_block(vcpu)) return 1; return 0; } static void vmx_post_block(struct kvm_vcpu *vcpu) { pi_post_block(vcpu); } static void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= FEAT_CTL_LMCE_ENABLED; else to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_LMCE_ENABLED; } static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { /* we need a nested vmexit to enter SMM, postpone if run is pending */ if (to_vmx(vcpu)->nested.nested_run_pending) return -EBUSY; return !is_smm(vcpu); } static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { struct vcpu_vmx *vmx = to_vmx(vcpu); vmx->nested.smm.guest_mode = is_guest_mode(vcpu); if (vmx->nested.smm.guest_mode) nested_vmx_vmexit(vcpu, -1, 0, 0); vmx->nested.smm.vmxon = vmx->nested.vmxon; vmx->nested.vmxon = false; vmx_clear_hlt(vcpu); return 0; } static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_vmx *vmx = to_vmx(vcpu); int ret; if (vmx->nested.smm.vmxon) { vmx->nested.vmxon = true; vmx->nested.smm.vmxon = false; } if (vmx->nested.smm.guest_mode) { ret = nested_vmx_enter_non_root_mode(vcpu, false); if (ret) return ret; vmx->nested.nested_run_pending = 1; vmx->nested.smm.guest_mode = false; } return 0; } static void vmx_enable_smi_window(struct kvm_vcpu *vcpu) { /* RSM will cause a vmexit anyway. */ } static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); } static void vmx_migrate_timers(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) { struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; if (hrtimer_try_to_cancel(timer) == 1) hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); } } static void hardware_unsetup(void) { kvm_set_posted_intr_wakeup_handler(NULL); if (nested) nested_vmx_hardware_unsetup(); free_kvm_area(); } static bool vmx_check_apicv_inhibit_reasons(ulong bit) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | BIT(APICV_INHIBIT_REASON_HYPERV); return supported & BIT(bit); } static struct kvm_x86_ops vmx_x86_ops __initdata = { .hardware_unsetup = hardware_unsetup, .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, .cpu_has_accelerated_tpr = report_flexpriority, .has_emulated_msr = vmx_has_emulated_msr, .vm_size = sizeof(struct kvm_vmx), .vm_init = vmx_vm_init, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, .vcpu_reset = vmx_vcpu_reset, .prepare_guest_switch = vmx_prepare_switch_to_guest, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, .update_exception_bitmap = vmx_update_exception_bitmap, .get_msr_feature = vmx_get_msr_feature, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, .get_segment_base = vmx_get_segment_base, .get_segment = vmx_get_segment, .set_segment = vmx_set_segment, .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, .is_valid_cr0 = vmx_is_valid_cr0, .set_cr0 = vmx_set_cr0, .is_valid_cr4 = vmx_is_valid_cr4, .set_cr4 = vmx_set_cr4, .set_efer = vmx_set_efer, .get_idt = vmx_get_idt, .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, .get_if_flag = vmx_get_if_flag, .tlb_flush_all = vmx_flush_tlb_all, .tlb_flush_current = vmx_flush_tlb_current, .tlb_flush_gva = vmx_flush_tlb_gva, .tlb_flush_guest = vmx_flush_tlb_guest, .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, .skip_emulated_instruction = vmx_skip_emulated_instruction, .update_emulated_instruction = vmx_update_emulated_instruction, .set_interrupt_shadow = vmx_set_interrupt_shadow, .get_interrupt_shadow = vmx_get_interrupt_shadow, .patch_hypercall = vmx_patch_hypercall, .set_irq = vmx_inject_irq, .set_nmi = vmx_inject_nmi, .queue_exception = vmx_queue_exception, .cancel_injection = vmx_cancel_injection, .interrupt_allowed = vmx_interrupt_allowed, .nmi_allowed = vmx_nmi_allowed, .get_nmi_mask = vmx_get_nmi_mask, .set_nmi_mask = vmx_set_nmi_mask, .enable_nmi_window = vmx_enable_nmi_window, .enable_irq_window = vmx_enable_irq_window, .update_cr8_intercept = vmx_update_cr8_intercept, .set_virtual_apic_mode = vmx_set_virtual_apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, .load_eoi_exitmap = vmx_load_eoi_exitmap, .apicv_post_state_restore = vmx_apicv_post_state_restore, .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, .set_tss_addr = vmx_set_tss_addr, .set_identity_map_addr = vmx_set_identity_map_addr, .get_mt_mask = vmx_get_mt_mask, .get_exit_info = vmx_get_exit_info, .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, .get_l2_tsc_offset = vmx_get_l2_tsc_offset, .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, .write_tsc_offset = vmx_write_tsc_offset, .write_tsc_multiplier = vmx_write_tsc_multiplier, .load_mmu_pgd = vmx_load_mmu_pgd, .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, .request_immediate_exit = vmx_request_immediate_exit, .sched_in = vmx_sched_in, .cpu_dirty_log_size = PML_ENTITY_NUM, .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, .pre_block = vmx_pre_block, .post_block = vmx_post_block, .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, .update_pi_irte = pi_update_irte, .start_assignment = vmx_pi_start_assignment, #ifdef CONFIG_X86_64 .set_hv_timer = vmx_set_hv_timer, .cancel_hv_timer = vmx_cancel_hv_timer, #endif .setup_mce = vmx_setup_mce, .smi_allowed = vmx_smi_allowed, .enter_smm = vmx_enter_smm, .leave_smm = vmx_leave_smm, .enable_smi_window = vmx_enable_smi_window, .can_emulate_instruction = vmx_can_emulate_instruction, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, .migrate_timers = vmx_migrate_timers, .msr_filter_changed = vmx_msr_filter_changed, .complete_emulated_msr = kvm_complete_insn_gp, .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, }; static __init void vmx_setup_user_return_msrs(void) { /* * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm * will emulate SYSCALL in legacy mode if the vendor string in guest * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To * support this emulation, MSR_STAR is included in the list for i386, * but is never loaded into hardware. MSR_CSTAR is also never loaded * into hardware and is here purely for emulation purposes. */ const u32 vmx_uret_msrs_list[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif MSR_EFER, MSR_TSC_AUX, MSR_STAR, MSR_IA32_TSX_CTRL, }; int i; BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) kvm_add_user_return_msr(vmx_uret_msrs_list[i]); } static __init int hardware_setup(void) { unsigned long host_bndcfgs; struct desc_ptr dt; int r, ept_lpage_level; store_idt(&dt); host_idt_base = dt.address; vmx_setup_user_return_msrs(); if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) return -EIO; if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); if (boot_cpu_has(X86_FEATURE_MPX)) { rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); } if (!cpu_has_vmx_mpx()) supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels() || !cpu_has_vmx_ept_mt_wb() || !cpu_has_vmx_invept_global()) enable_ept = 0; /* NX support is required for shadow paging. */ if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n"); return -EOPNOTSUPP; } if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) enable_ept_ad_bits = 0; if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) enable_unrestricted_guest = 0; if (!cpu_has_vmx_flexpriority()) flexpriority_enabled = 0; if (!cpu_has_virtual_nmis()) enable_vnmi = 0; #ifdef CONFIG_X86_SGX_KVM if (!cpu_has_vmx_encls_vmexit()) enable_sgx = false; #endif /* * set_apic_access_page_addr() is used to reload apic access * page upon invalidation. No need to do anything if not * using the APIC_ACCESS_ADDR VMCS field. */ if (!flexpriority_enabled) vmx_x86_ops.set_apic_access_page_addr = NULL; if (!cpu_has_vmx_tpr_shadow()) vmx_x86_ops.update_cr8_intercept = NULL; #if IS_ENABLED(CONFIG_HYPERV) if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH && enable_ept) { vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb; vmx_x86_ops.tlb_remote_flush_with_range = hv_remote_flush_tlb_with_range; } #endif if (!cpu_has_vmx_ple()) { ple_gap = 0; ple_window = 0; ple_window_grow = 0; ple_window_max = 0; ple_window_shrink = 0; } if (!cpu_has_vmx_apicv()) enable_apicv = 0; if (!enable_apicv) vmx_x86_ops.sync_pir_to_irr = NULL; if (cpu_has_vmx_tsc_scaling()) { kvm_has_tsc_control = true; kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; kvm_tsc_scaling_ratio_frac_bits = 48; } kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ if (enable_ept) kvm_mmu_set_ept_masks(enable_ept_ad_bits, cpu_has_vmx_ept_execute_only()); if (!enable_ept) ept_lpage_level = 0; else if (cpu_has_vmx_ept_1g_page()) ept_lpage_level = PG_LEVEL_1G; else if (cpu_has_vmx_ept_2m_page()) ept_lpage_level = PG_LEVEL_2M; else ept_lpage_level = PG_LEVEL_4K; kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), ept_lpage_level); /* * Only enable PML when hardware supports PML feature, and both EPT * and EPT A/D bit features are enabled -- PML depends on them to work. */ if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) enable_pml = 0; if (!enable_pml) vmx_x86_ops.cpu_dirty_log_size = 0; if (!cpu_has_vmx_preemption_timer()) enable_preemption_timer = false; if (enable_preemption_timer) { u64 use_timer_freq = 5000ULL * 1000 * 1000; u64 vmx_msr; rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); cpu_preemption_timer_multi = vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; if (tsc_khz) use_timer_freq = (u64)tsc_khz * 1000; use_timer_freq >>= cpu_preemption_timer_multi; /* * KVM "disables" the preemption timer by setting it to its max * value. Don't use the timer if it might cause spurious exits * at a rate faster than 0.1 Hz (of uninterrupted guest time). */ if (use_timer_freq > 0xffffffffu / 10) enable_preemption_timer = false; } if (!enable_preemption_timer) { vmx_x86_ops.set_hv_timer = NULL; vmx_x86_ops.cancel_hv_timer = NULL; vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } kvm_mce_cap_supported |= MCG_LMCE_P; if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) return -EINVAL; if (!enable_ept || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; setup_default_sgx_lepubkeyhash(); if (nested) { nested_vmx_setup_ctls_msrs(&vmcs_config.nested, vmx_capability.ept); r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); if (r) return r; } vmx_set_cpu_caps(); r = alloc_kvm_area(); if (r) nested_vmx_hardware_unsetup(); kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); return r; } static struct kvm_x86_init_ops vmx_init_ops __initdata = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, .check_processor_compatibility = vmx_check_processor_compat, .hardware_setup = hardware_setup, .intel_pt_intr_in_guest = vmx_pt_mode_is_host_guest, .runtime_ops = &vmx_x86_ops, }; static void vmx_cleanup_l1d_flush(void) { if (vmx_l1d_flush_pages) { free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); vmx_l1d_flush_pages = NULL; } /* Restore state so sysfs ignores VMX */ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; } static void vmx_exit(void) { #ifdef CONFIG_KEXEC_CORE RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); synchronize_rcu(); #endif kvm_exit(); #if IS_ENABLED(CONFIG_HYPERV) if (static_branch_unlikely(&enable_evmcs)) { int cpu; struct hv_vp_assist_page *vp_ap; /* * Reset everything to support using non-enlightened VMCS * access later (e.g. when we reload the module with * enlightened_vmcs=0) */ for_each_online_cpu(cpu) { vp_ap = hv_get_vp_assist_page(cpu); if (!vp_ap) continue; vp_ap->nested_control.features.directhypercall = 0; vp_ap->current_nested_vmcs = 0; vp_ap->enlighten_vmentry = 0; } static_branch_disable(&enable_evmcs); } #endif vmx_cleanup_l1d_flush(); allow_smaller_maxphyaddr = false; } module_exit(vmx_exit); static int __init vmx_init(void) { int r, cpu; #if IS_ENABLED(CONFIG_HYPERV) /* * Enlightened VMCS usage should be recommended and the host needs * to support eVMCS v1 or above. We can also disable eVMCS support * with module parameter. */ if (enlightened_vmcs && ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= KVM_EVMCS_VERSION) { int cpu; /* Check that we have assist pages on all online CPUs */ for_each_online_cpu(cpu) { if (!hv_get_vp_assist_page(cpu)) { enlightened_vmcs = false; break; } } if (enlightened_vmcs) { pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); static_branch_enable(&enable_evmcs); } if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) vmx_x86_ops.enable_direct_tlbflush = hv_enable_direct_tlbflush; } else { enlightened_vmcs = false; } #endif r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) return r; /* * Must be called after kvm_init() so enable_ept is properly set * up. Hand the parameter mitigation value in which was stored in * the pre module init parser. If no parameter was given, it will * contain 'auto' which will be turned into the default 'cond' * mitigation mode. */ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); if (r) { vmx_exit(); return r; } vmx_setup_fb_clear_ctrl(); for_each_possible_cpu(cpu) { INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); pi_init_cpu(cpu); } #ifdef CONFIG_KEXEC_CORE rcu_assign_pointer(crash_vmclear_loaded_vmcss, crash_vmclear_local_loaded_vmcss); #endif vmx_check_vmcs12_offsets(); /* * Shadow paging doesn't have a (further) performance penalty * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it * by default */ if (!enable_ept) allow_smaller_maxphyaddr = true; return 0; } module_init(vmx_init); |
1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 | // SPDX-License-Identifier: GPL-2.0+ /* * Support for emulating SAT (ata pass through) on devices based * on the Cypress USB/ATA bridge supporting ATACB. * * Copyright (c) 2008 Matthieu Castet (castet.matthieu@free.fr) */ #include <linux/module.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_eh.h> #include <linux/ata.h> #include "usb.h" #include "protocol.h" #include "scsiglue.h" #include "debug.h" #define DRV_NAME "ums-cypress" MODULE_DESCRIPTION("SAT support for Cypress USB/ATA bridges with ATACB"); MODULE_AUTHOR("Matthieu Castet <castet.matthieu@free.fr>"); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS(USB_STORAGE); /* * The table of devices */ #define UNUSUAL_DEV(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax, \ vendorName, productName, useProtocol, useTransport, \ initFunction, flags) \ { USB_DEVICE_VER(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax), \ .driver_info = (flags) } static struct usb_device_id cypress_usb_ids[] = { # include "unusual_cypress.h" { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, cypress_usb_ids); #undef UNUSUAL_DEV /* * The flags table */ #define UNUSUAL_DEV(idVendor, idProduct, bcdDeviceMin, bcdDeviceMax, \ vendor_name, product_name, use_protocol, use_transport, \ init_function, Flags) \ { \ .vendorName = vendor_name, \ .productName = product_name, \ .useProtocol = use_protocol, \ .useTransport = use_transport, \ .initFunction = init_function, \ } static struct us_unusual_dev cypress_unusual_dev_list[] = { # include "unusual_cypress.h" { } /* Terminating entry */ }; #undef UNUSUAL_DEV /* * ATACB is a protocol used on cypress usb<->ata bridge to * send raw ATA command over mass storage * There is a ATACB2 protocol that support LBA48 on newer chip. * More info that be found on cy7c68310_8.pdf and cy7c68300c_8.pdf * datasheet from cypress.com. */ static void cypress_atacb_passthrough(struct scsi_cmnd *srb, struct us_data *us) { unsigned char save_cmnd[MAX_COMMAND_SIZE]; if (likely(srb->cmnd[0] != ATA_16 && srb->cmnd[0] != ATA_12)) { usb_stor_transparent_scsi_command(srb, us); return; } memcpy(save_cmnd, srb->cmnd, sizeof(save_cmnd)); memset(srb->cmnd, 0, MAX_COMMAND_SIZE); /* check if we support the command */ if (save_cmnd[1] >> 5) /* MULTIPLE_COUNT */ goto invalid_fld; /* check protocol */ switch ((save_cmnd[1] >> 1) & 0xf) { case 3: /*no DATA */ case 4: /* PIO in */ case 5: /* PIO out */ break; default: goto invalid_fld; } /* first build the ATACB command */ srb->cmd_len = 16; srb->cmnd[0] = 0x24; /* * bVSCBSignature : vendor-specific command * this value can change, but most(all ?) manufacturers * keep the cypress default : 0x24 */ srb->cmnd[1] = 0x24; /* bVSCBSubCommand : 0x24 for ATACB */ srb->cmnd[3] = 0xff - 1; /* * features, sector count, lba low, lba med * lba high, device, command are valid */ srb->cmnd[4] = 1; /* TransferBlockCount : 512 */ if (save_cmnd[0] == ATA_16) { srb->cmnd[ 6] = save_cmnd[ 4]; /* features */ srb->cmnd[ 7] = save_cmnd[ 6]; /* sector count */ srb->cmnd[ 8] = save_cmnd[ 8]; /* lba low */ srb->cmnd[ 9] = save_cmnd[10]; /* lba med */ srb->cmnd[10] = save_cmnd[12]; /* lba high */ srb->cmnd[11] = save_cmnd[13]; /* device */ srb->cmnd[12] = save_cmnd[14]; /* command */ if (save_cmnd[1] & 0x01) {/* extended bit set for LBA48 */ /* this could be supported by atacb2 */ if (save_cmnd[3] || save_cmnd[5] || save_cmnd[7] || save_cmnd[9] || save_cmnd[11]) goto invalid_fld; } } else { /* ATA12 */ srb->cmnd[ 6] = save_cmnd[3]; /* features */ srb->cmnd[ 7] = save_cmnd[4]; /* sector count */ srb->cmnd[ 8] = save_cmnd[5]; /* lba low */ srb->cmnd[ 9] = save_cmnd[6]; /* lba med */ srb->cmnd[10] = save_cmnd[7]; /* lba high */ srb->cmnd[11] = save_cmnd[8]; /* device */ srb->cmnd[12] = save_cmnd[9]; /* command */ } /* Filter SET_FEATURES - XFER MODE command */ if ((srb->cmnd[12] == ATA_CMD_SET_FEATURES) && (srb->cmnd[6] == SETFEATURES_XFER)) goto invalid_fld; if (srb->cmnd[12] == ATA_CMD_ID_ATA || srb->cmnd[12] == ATA_CMD_ID_ATAPI) srb->cmnd[2] |= (1<<7); /* set IdentifyPacketDevice for these cmds */ usb_stor_transparent_scsi_command(srb, us); /* if the device doesn't support ATACB */ if (srb->result == SAM_STAT_CHECK_CONDITION && memcmp(srb->sense_buffer, usb_stor_sense_invalidCDB, sizeof(usb_stor_sense_invalidCDB)) == 0) { usb_stor_dbg(us, "cypress atacb not supported ???\n"); goto end; } /* * if ck_cond flags is set, and there wasn't critical error, * build the special sense */ if ((srb->result != (DID_ERROR << 16) && srb->result != (DID_ABORT << 16)) && save_cmnd[2] & 0x20) { struct scsi_eh_save ses; unsigned char regs[8]; unsigned char *sb = srb->sense_buffer; unsigned char *desc = sb + 8; int tmp_result; /* build the command for reading the ATA registers */ scsi_eh_prep_cmnd(srb, &ses, NULL, 0, sizeof(regs)); /* * we use the same command as before, but we set * the read taskfile bit, for not executing atacb command, * but reading register selected in srb->cmnd[4] */ srb->cmd_len = 16; srb->cmnd = ses.cmnd; srb->cmnd[2] = 1; usb_stor_transparent_scsi_command(srb, us); memcpy(regs, srb->sense_buffer, sizeof(regs)); tmp_result = srb->result; scsi_eh_restore_cmnd(srb, &ses); /* we fail to get registers, report invalid command */ if (tmp_result != SAM_STAT_GOOD) goto invalid_fld; /* build the sense */ memset(sb, 0, SCSI_SENSE_BUFFERSIZE); /* set sk, asc for a good command */ sb[1] = RECOVERED_ERROR; sb[2] = 0; /* ATA PASS THROUGH INFORMATION AVAILABLE */ sb[3] = 0x1D; /* * XXX we should generate sk, asc, ascq from status and error * regs * (see 11.1 Error translation ATA device error to SCSI error * map, and ata_to_sense_error from libata.) */ /* Sense data is current and format is descriptor. */ sb[0] = 0x72; desc[0] = 0x09; /* ATA_RETURN_DESCRIPTOR */ /* set length of additional sense data */ sb[7] = 14; desc[1] = 12; /* Copy registers into sense buffer. */ desc[ 2] = 0x00; desc[ 3] = regs[1]; /* features */ desc[ 5] = regs[2]; /* sector count */ desc[ 7] = regs[3]; /* lba low */ desc[ 9] = regs[4]; /* lba med */ desc[11] = regs[5]; /* lba high */ desc[12] = regs[6]; /* device */ desc[13] = regs[7]; /* command */ srb->result = SAM_STAT_CHECK_CONDITION; } goto end; invalid_fld: srb->result = SAM_STAT_CHECK_CONDITION; memcpy(srb->sense_buffer, usb_stor_sense_invalidCDB, sizeof(usb_stor_sense_invalidCDB)); end: memcpy(srb->cmnd, save_cmnd, sizeof(save_cmnd)); if (srb->cmnd[0] == ATA_12) srb->cmd_len = 12; } static struct scsi_host_template cypress_host_template; static int cypress_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct us_data *us; int result; struct usb_device *device; result = usb_stor_probe1(&us, intf, id, (id - cypress_usb_ids) + cypress_unusual_dev_list, &cypress_host_template); if (result) return result; /* * Among CY7C68300 chips, the A revision does not support Cypress ATACB * Filter out this revision from EEPROM default descriptor values */ device = interface_to_usbdev(intf); if (device->descriptor.iManufacturer != 0x38 || device->descriptor.iProduct != 0x4e || device->descriptor.iSerialNumber != 0x64) { us->protocol_name = "Transparent SCSI with Cypress ATACB"; us->proto_handler = cypress_atacb_passthrough; } else { us->protocol_name = "Transparent SCSI"; us->proto_handler = usb_stor_transparent_scsi_command; } result = usb_stor_probe2(us); return result; } static struct usb_driver cypress_driver = { .name = DRV_NAME, .probe = cypress_probe, .disconnect = usb_stor_disconnect, .suspend = usb_stor_suspend, .resume = usb_stor_resume, .reset_resume = usb_stor_reset_resume, .pre_reset = usb_stor_pre_reset, .post_reset = usb_stor_post_reset, .id_table = cypress_usb_ids, .soft_unbind = 1, .no_dynamic_id = 1, }; module_usb_stor_driver(cypress_driver, cypress_host_template, DRV_NAME); |
109 65 4 47 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions of the Internet Protocol. * * Version: @(#)in.h 1.0.1 04/21/93 * * Authors: Original taken from the GNU Project <netinet/in.h> file. * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_IN_H #define _LINUX_IN_H #include <linux/errno.h> #include <uapi/linux/in.h> static inline int proto_ports_offset(int proto) { switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_DCCP: case IPPROTO_ESP: /* SPI */ case IPPROTO_SCTP: case IPPROTO_UDPLITE: return 0; case IPPROTO_AH: /* SPI */ return 4; default: return -EINVAL; } } static inline bool ipv4_is_loopback(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x7f000000); } static inline bool ipv4_is_multicast(__be32 addr) { return (addr & htonl(0xf0000000)) == htonl(0xe0000000); } static inline bool ipv4_is_local_multicast(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xe0000000); } static inline bool ipv4_is_lbcast(__be32 addr) { /* limited broadcast */ return addr == htonl(INADDR_BROADCAST); } static inline bool ipv4_is_all_snoopers(__be32 addr) { return addr == htonl(INADDR_ALLSNOOPERS_GROUP); } static inline bool ipv4_is_zeronet(__be32 addr) { return (addr == 0); } /* Special-Use IPv4 Addresses (RFC3330) */ static inline bool ipv4_is_private_10(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x0a000000); } static inline bool ipv4_is_private_172(__be32 addr) { return (addr & htonl(0xfff00000)) == htonl(0xac100000); } static inline bool ipv4_is_private_192(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xc0a80000); } static inline bool ipv4_is_linklocal_169(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xa9fe0000); } static inline bool ipv4_is_anycast_6to4(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0586300); } static inline bool ipv4_is_test_192(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0000200); } static inline bool ipv4_is_test_198(__be32 addr) { return (addr & htonl(0xfffe0000)) == htonl(0xc6120000); } #endif /* _LINUX_IN_H */ |
215 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 | /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* * include/linux/eventpoll.h ( Efficient event polling implementation ) * Copyright (C) 2001,...,2006 Davide Libenzi * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * Davide Libenzi <davidel@xmailserver.org> * */ #ifndef _UAPI_LINUX_EVENTPOLL_H #define _UAPI_LINUX_EVENTPOLL_H /* For O_CLOEXEC */ #include <linux/fcntl.h> #include <linux/types.h> /* Flags for epoll_create1. */ #define EPOLL_CLOEXEC O_CLOEXEC /* Valid opcodes to issue to sys_epoll_ctl() */ #define EPOLL_CTL_ADD 1 #define EPOLL_CTL_DEL 2 #define EPOLL_CTL_MOD 3 /* Epoll event masks */ #define EPOLLIN (__force __poll_t)0x00000001 #define EPOLLPRI (__force __poll_t)0x00000002 #define EPOLLOUT (__force __poll_t)0x00000004 #define EPOLLERR (__force __poll_t)0x00000008 #define EPOLLHUP (__force __poll_t)0x00000010 #define EPOLLNVAL (__force __poll_t)0x00000020 #define EPOLLRDNORM (__force __poll_t)0x00000040 #define EPOLLRDBAND (__force __poll_t)0x00000080 #define EPOLLWRNORM (__force __poll_t)0x00000100 #define EPOLLWRBAND (__force __poll_t)0x00000200 #define EPOLLMSG (__force __poll_t)0x00000400 #define EPOLLRDHUP (__force __poll_t)0x00002000 /* * Internal flag - wakeup generated by io_uring, used to detect recursion back * into the io_uring poll handler. */ #define EPOLL_URING_WAKE ((__force __poll_t)(1U << 27)) /* Set exclusive wakeup mode for the target file descriptor */ #define EPOLLEXCLUSIVE ((__force __poll_t)(1U << 28)) /* * Request the handling of system wakeup events so as to prevent system suspends * from happening while those events are being processed. * * Assuming neither EPOLLET nor EPOLLONESHOT is set, system suspends will not be * re-allowed until epoll_wait is called again after consuming the wakeup * event(s). * * Requires CAP_BLOCK_SUSPEND */ #define EPOLLWAKEUP ((__force __poll_t)(1U << 29)) /* Set the One Shot behaviour for the target file descriptor */ #define EPOLLONESHOT ((__force __poll_t)(1U << 30)) /* Set the Edge Triggered behaviour for the target file descriptor */ #define EPOLLET ((__force __poll_t)(1U << 31)) /* * On x86-64 make the 64bit structure have the same alignment as the * 32bit structure. This makes 32bit emulation easier. * * UML/x86_64 needs the same packing as x86_64 */ #ifdef __x86_64__ #define EPOLL_PACKED __attribute__((packed)) #else #define EPOLL_PACKED #endif struct epoll_event { __poll_t events; __u64 data; } EPOLL_PACKED; #ifdef CONFIG_PM_SLEEP static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) { if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND)) epev->events &= ~EPOLLWAKEUP; } #else static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) { epev->events &= ~EPOLLWAKEUP; } #endif #endif /* _UAPI_LINUX_EVENTPOLL_H */ |
98 | 1 2 3 4 5 6 7 8 9 10 11 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_NS_HASH_H__ #define __NET_NS_HASH_H__ #include <net/net_namespace.h> static inline u32 net_hash_mix(const struct net *net) { return net->hash_mix; } #endif |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2016 HGST, a Western Digital Company. */ #include <linux/moduleparam.h> #include <linux/slab.h> #include <linux/pci-p2pdma.h> #include <rdma/mr_pool.h> #include <rdma/rw.h> enum { RDMA_RW_SINGLE_WR, RDMA_RW_MULTI_WR, RDMA_RW_MR, RDMA_RW_SIG_MR, }; static bool rdma_rw_force_mr; module_param_named(force_mr, rdma_rw_force_mr, bool, 0); MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations"); /* * Report whether memory registration should be used. Memory registration must * be used for iWarp devices because of iWARP-specific limitations. Memory * registration is also enabled if registering memory might yield better * performance than using multiple SGE entries, see rdma_rw_io_needs_mr() */ static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u32 port_num) { if (rdma_protocol_iwarp(dev, port_num)) return true; if (dev->attrs.max_sgl_rd) return true; if (unlikely(rdma_rw_force_mr)) return true; return false; } /* * Check if the device will use memory registration for this RW operation. * For RDMA READs we must use MRs on iWarp and can optionally use them as an * optimization otherwise. Additionally we have a debug option to force usage * of MRs to help testing this code path. */ static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u32 port_num, enum dma_data_direction dir, int dma_nents) { if (dir == DMA_FROM_DEVICE) { if (rdma_protocol_iwarp(dev, port_num)) return true; if (dev->attrs.max_sgl_rd && dma_nents > dev->attrs.max_sgl_rd) return true; } if (unlikely(rdma_rw_force_mr)) return true; return false; } static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev, bool pi_support) { u32 max_pages; if (pi_support) max_pages = dev->attrs.max_pi_fast_reg_page_list_len; else max_pages = dev->attrs.max_fast_reg_page_list_len; /* arbitrary limit to avoid allocating gigantic resources */ return min_t(u32, max_pages, 256); } static inline int rdma_rw_inv_key(struct rdma_rw_reg_ctx *reg) { int count = 0; if (reg->mr->need_inval) { reg->inv_wr.opcode = IB_WR_LOCAL_INV; reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey; reg->inv_wr.next = ®->reg_wr.wr; count++; } else { reg->inv_wr.next = NULL; } return count; } /* Caller must have zero-initialized *reg. */ static int rdma_rw_init_one_mr(struct ib_qp *qp, u32 port_num, struct rdma_rw_reg_ctx *reg, struct scatterlist *sg, u32 sg_cnt, u32 offset) { u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device, qp->integrity_en); u32 nents = min(sg_cnt, pages_per_mr); int count = 0, ret; reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs); if (!reg->mr) return -EAGAIN; count += rdma_rw_inv_key(reg); ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE); if (ret < 0 || ret < nents) { ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr); return -EINVAL; } reg->reg_wr.wr.opcode = IB_WR_REG_MR; reg->reg_wr.mr = reg->mr; reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE; if (rdma_protocol_iwarp(qp->device, port_num)) reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE; count++; reg->sge.addr = reg->mr->iova; reg->sge.length = reg->mr->length; return count; } static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { struct rdma_rw_reg_ctx *prev = NULL; u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device, qp->integrity_en); int i, j, ret = 0, count = 0; ctx->nr_ops = DIV_ROUND_UP(sg_cnt, pages_per_mr); ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL); if (!ctx->reg) { ret = -ENOMEM; goto out; } for (i = 0; i < ctx->nr_ops; i++) { struct rdma_rw_reg_ctx *reg = &ctx->reg[i]; u32 nents = min(sg_cnt, pages_per_mr); ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sg_cnt, offset); if (ret < 0) goto out_free; count += ret; if (prev) { if (reg->mr->need_inval) prev->wr.wr.next = ®->inv_wr; else prev->wr.wr.next = ®->reg_wr.wr; } reg->reg_wr.wr.next = ®->wr.wr; reg->wr.wr.sg_list = ®->sge; reg->wr.wr.num_sge = 1; reg->wr.remote_addr = remote_addr; reg->wr.rkey = rkey; if (dir == DMA_TO_DEVICE) { reg->wr.wr.opcode = IB_WR_RDMA_WRITE; } else if (!rdma_cap_read_inv(qp->device, port_num)) { reg->wr.wr.opcode = IB_WR_RDMA_READ; } else { reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey; } count++; remote_addr += reg->sge.length; sg_cnt -= nents; for (j = 0; j < nents; j++) sg = sg_next(sg); prev = reg; offset = 0; } if (prev) prev->wr.wr.next = NULL; ctx->type = RDMA_RW_MR; return count; out_free: while (--i >= 0) ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); kfree(ctx->reg); out: return ret; } static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, struct scatterlist *sg, u32 sg_cnt, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge : qp->max_read_sge; struct ib_sge *sge; u32 total_len = 0, i, j; ctx->nr_ops = DIV_ROUND_UP(sg_cnt, max_sge); ctx->map.sges = sge = kcalloc(sg_cnt, sizeof(*sge), GFP_KERNEL); if (!ctx->map.sges) goto out; ctx->map.wrs = kcalloc(ctx->nr_ops, sizeof(*ctx->map.wrs), GFP_KERNEL); if (!ctx->map.wrs) goto out_free_sges; for (i = 0; i < ctx->nr_ops; i++) { struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i]; u32 nr_sge = min(sg_cnt, max_sge); if (dir == DMA_TO_DEVICE) rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; else rdma_wr->wr.opcode = IB_WR_RDMA_READ; rdma_wr->remote_addr = remote_addr + total_len; rdma_wr->rkey = rkey; rdma_wr->wr.num_sge = nr_sge; rdma_wr->wr.sg_list = sge; for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) { sge->addr = sg_dma_address(sg) + offset; sge->length = sg_dma_len(sg) - offset; sge->lkey = qp->pd->local_dma_lkey; total_len += sge->length; sge++; sg_cnt--; offset = 0; } rdma_wr->wr.next = i + 1 < ctx->nr_ops ? &ctx->map.wrs[i + 1].wr : NULL; } ctx->type = RDMA_RW_MULTI_WR; return ctx->nr_ops; out_free_sges: kfree(ctx->map.sges); out: return -ENOMEM; } static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp, struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { struct ib_rdma_wr *rdma_wr = &ctx->single.wr; ctx->nr_ops = 1; ctx->single.sge.lkey = qp->pd->local_dma_lkey; ctx->single.sge.addr = sg_dma_address(sg) + offset; ctx->single.sge.length = sg_dma_len(sg) - offset; memset(rdma_wr, 0, sizeof(*rdma_wr)); if (dir == DMA_TO_DEVICE) rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; else rdma_wr->wr.opcode = IB_WR_RDMA_READ; rdma_wr->wr.sg_list = &ctx->single.sge; rdma_wr->wr.num_sge = 1; rdma_wr->remote_addr = remote_addr; rdma_wr->rkey = rkey; ctx->type = RDMA_RW_SINGLE_WR; return 1; } static void rdma_rw_unmap_sg(struct ib_device *dev, struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir) { if (is_pci_p2pdma_page(sg_page(sg))) pci_p2pdma_unmap_sg(dev->dma_device, sg, sg_cnt, dir); else ib_dma_unmap_sg(dev, sg, sg_cnt, dir); } static int rdma_rw_map_sg(struct ib_device *dev, struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir) { if (is_pci_p2pdma_page(sg_page(sg))) { if (WARN_ON_ONCE(ib_uses_virt_dma(dev))) return 0; return pci_p2pdma_map_sg(dev->dma_device, sg, sg_cnt, dir); } return ib_dma_map_sg(dev, sg, sg_cnt, dir); } /** * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context * @ctx: context to initialize * @qp: queue pair to operate on * @port_num: port num to which the connection is bound * @sg: scatterlist to READ/WRITE from/to * @sg_cnt: number of entries in @sg * @sg_offset: current byte offset into @sg * @remote_addr:remote address to read/write (relative to @rkey) * @rkey: remote key to operate on * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ * * Returns the number of WQEs that will be needed on the workqueue if * successful, or a negative error code. */ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 sg_offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { struct ib_device *dev = qp->pd->device; int ret; ret = rdma_rw_map_sg(dev, sg, sg_cnt, dir); if (!ret) return -ENOMEM; sg_cnt = ret; /* * Skip to the S/G entry that sg_offset falls into: */ for (;;) { u32 len = sg_dma_len(sg); if (sg_offset < len) break; sg = sg_next(sg); sg_offset -= len; sg_cnt--; } ret = -EIO; if (WARN_ON_ONCE(sg_cnt == 0)) goto out_unmap_sg; if (rdma_rw_io_needs_mr(qp->device, port_num, dir, sg_cnt)) { ret = rdma_rw_init_mr_wrs(ctx, qp, port_num, sg, sg_cnt, sg_offset, remote_addr, rkey, dir); } else if (sg_cnt > 1) { ret = rdma_rw_init_map_wrs(ctx, qp, sg, sg_cnt, sg_offset, remote_addr, rkey, dir); } else { ret = rdma_rw_init_single_wr(ctx, qp, sg, sg_offset, remote_addr, rkey, dir); } if (ret < 0) goto out_unmap_sg; return ret; out_unmap_sg: rdma_rw_unmap_sg(dev, sg, sg_cnt, dir); return ret; } EXPORT_SYMBOL(rdma_rw_ctx_init); /** * rdma_rw_ctx_signature_init - initialize a RW context with signature offload * @ctx: context to initialize * @qp: queue pair to operate on * @port_num: port num to which the connection is bound * @sg: scatterlist to READ/WRITE from/to * @sg_cnt: number of entries in @sg * @prot_sg: scatterlist to READ/WRITE protection information from/to * @prot_sg_cnt: number of entries in @prot_sg * @sig_attrs: signature offloading algorithms * @remote_addr:remote address to read/write (relative to @rkey) * @rkey: remote key to operate on * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ * * Returns the number of WQEs that will be needed on the workqueue if * successful, or a negative error code. */ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct scatterlist *sg, u32 sg_cnt, struct scatterlist *prot_sg, u32 prot_sg_cnt, struct ib_sig_attrs *sig_attrs, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { struct ib_device *dev = qp->pd->device; u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device, qp->integrity_en); struct ib_rdma_wr *rdma_wr; int count = 0, ret; if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) { pr_err("SG count too large: sg_cnt=%u, prot_sg_cnt=%u, pages_per_mr=%u\n", sg_cnt, prot_sg_cnt, pages_per_mr); return -EINVAL; } ret = rdma_rw_map_sg(dev, sg, sg_cnt, dir); if (!ret) return -ENOMEM; sg_cnt = ret; if (prot_sg_cnt) { ret = rdma_rw_map_sg(dev, prot_sg, prot_sg_cnt, dir); if (!ret) { ret = -ENOMEM; goto out_unmap_sg; } prot_sg_cnt = ret; } ctx->type = RDMA_RW_SIG_MR; ctx->nr_ops = 1; ctx->reg = kzalloc(sizeof(*ctx->reg), GFP_KERNEL); if (!ctx->reg) { ret = -ENOMEM; goto out_unmap_prot_sg; } ctx->reg->mr = ib_mr_pool_get(qp, &qp->sig_mrs); if (!ctx->reg->mr) { ret = -EAGAIN; goto out_free_ctx; } count += rdma_rw_inv_key(ctx->reg); memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs)); ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sg_cnt, NULL, prot_sg, prot_sg_cnt, NULL, SZ_4K); if (unlikely(ret)) { pr_err("failed to map PI sg (%u)\n", sg_cnt + prot_sg_cnt); goto out_destroy_sig_mr; } ctx->reg->reg_wr.wr.opcode = IB_WR_REG_MR_INTEGRITY; ctx->reg->reg_wr.wr.wr_cqe = NULL; ctx->reg->reg_wr.wr.num_sge = 0; ctx->reg->reg_wr.wr.send_flags = 0; ctx->reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE; if (rdma_protocol_iwarp(qp->device, port_num)) ctx->reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE; ctx->reg->reg_wr.mr = ctx->reg->mr; ctx->reg->reg_wr.key = ctx->reg->mr->lkey; count++; ctx->reg->sge.addr = ctx->reg->mr->iova; ctx->reg->sge.length = ctx->reg->mr->length; if (sig_attrs->wire.sig_type == IB_SIG_TYPE_NONE) ctx->reg->sge.length -= ctx->reg->mr->sig_attrs->meta_length; rdma_wr = &ctx->reg->wr; rdma_wr->wr.sg_list = &ctx->reg->sge; rdma_wr->wr.num_sge = 1; rdma_wr->remote_addr = remote_addr; rdma_wr->rkey = rkey; if (dir == DMA_TO_DEVICE) rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; else rdma_wr->wr.opcode = IB_WR_RDMA_READ; ctx->reg->reg_wr.wr.next = &rdma_wr->wr; count++; return count; out_destroy_sig_mr: ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr); out_free_ctx: kfree(ctx->reg); out_unmap_prot_sg: if (prot_sg_cnt) rdma_rw_unmap_sg(dev, prot_sg, prot_sg_cnt, dir); out_unmap_sg: rdma_rw_unmap_sg(dev, sg, sg_cnt, dir); return ret; } EXPORT_SYMBOL(rdma_rw_ctx_signature_init); /* * Now that we are going to post the WRs we can update the lkey and need_inval * state on the MRs. If we were doing this at init time, we would get double * or missing invalidations if a context was initialized but not actually * posted. */ static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval) { reg->mr->need_inval = need_inval; ib_update_fast_reg_key(reg->mr, ib_inc_rkey(reg->mr->lkey)); reg->reg_wr.key = reg->mr->lkey; reg->sge.lkey = reg->mr->lkey; } /** * rdma_rw_ctx_wrs - return chain of WRs for a RDMA READ or WRITE operation * @ctx: context to operate on * @qp: queue pair to operate on * @port_num: port num to which the connection is bound * @cqe: completion queue entry for the last WR * @chain_wr: WR to append to the posted chain * * Return the WR chain for the set of RDMA READ/WRITE operations described by * @ctx, as well as any memory registration operations needed. If @chain_wr * is non-NULL the WR it points to will be appended to the chain of WRs posted. * If @chain_wr is not set @cqe must be set so that the caller gets a * completion notification. */ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr) { struct ib_send_wr *first_wr, *last_wr; int i; switch (ctx->type) { case RDMA_RW_SIG_MR: case RDMA_RW_MR: for (i = 0; i < ctx->nr_ops; i++) { rdma_rw_update_lkey(&ctx->reg[i], ctx->reg[i].wr.wr.opcode != IB_WR_RDMA_READ_WITH_INV); } if (ctx->reg[0].inv_wr.next) first_wr = &ctx->reg[0].inv_wr; else first_wr = &ctx->reg[0].reg_wr.wr; last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr; break; case RDMA_RW_MULTI_WR: first_wr = &ctx->map.wrs[0].wr; last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr; break; case RDMA_RW_SINGLE_WR: first_wr = &ctx->single.wr.wr; last_wr = &ctx->single.wr.wr; break; default: BUG(); } if (chain_wr) { last_wr->next = chain_wr; } else { last_wr->wr_cqe = cqe; last_wr->send_flags |= IB_SEND_SIGNALED; } return first_wr; } EXPORT_SYMBOL(rdma_rw_ctx_wrs); /** * rdma_rw_ctx_post - post a RDMA READ or RDMA WRITE operation * @ctx: context to operate on * @qp: queue pair to operate on * @port_num: port num to which the connection is bound * @cqe: completion queue entry for the last WR * @chain_wr: WR to append to the posted chain * * Post the set of RDMA READ/WRITE operations described by @ctx, as well as * any memory registration operations needed. If @chain_wr is non-NULL the * WR it points to will be appended to the chain of WRs posted. If @chain_wr * is not set @cqe must be set so that the caller gets a completion * notification. */ int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr) { struct ib_send_wr *first_wr; first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, cqe, chain_wr); return ib_post_send(qp, first_wr, NULL); } EXPORT_SYMBOL(rdma_rw_ctx_post); /** * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init * @ctx: context to release * @qp: queue pair to operate on * @port_num: port num to which the connection is bound * @sg: scatterlist that was used for the READ/WRITE * @sg_cnt: number of entries in @sg * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ */ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir) { int i; switch (ctx->type) { case RDMA_RW_MR: for (i = 0; i < ctx->nr_ops; i++) ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); kfree(ctx->reg); break; case RDMA_RW_MULTI_WR: kfree(ctx->map.wrs); kfree(ctx->map.sges); break; case RDMA_RW_SINGLE_WR: break; default: BUG(); break; } rdma_rw_unmap_sg(qp->pd->device, sg, sg_cnt, dir); } EXPORT_SYMBOL(rdma_rw_ctx_destroy); /** * rdma_rw_ctx_destroy_signature - release all resources allocated by * rdma_rw_ctx_signature_init * @ctx: context to release * @qp: queue pair to operate on * @port_num: port num to which the connection is bound * @sg: scatterlist that was used for the READ/WRITE * @sg_cnt: number of entries in @sg * @prot_sg: scatterlist that was used for the READ/WRITE of the PI * @prot_sg_cnt: number of entries in @prot_sg * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ */ void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num, struct scatterlist *sg, u32 sg_cnt, struct scatterlist *prot_sg, u32 prot_sg_cnt, enum dma_data_direction dir) { if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR)) return; ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr); kfree(ctx->reg); if (prot_sg_cnt) rdma_rw_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir); rdma_rw_unmap_sg(qp->pd->device, sg, sg_cnt, dir); } EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature); /** * rdma_rw_mr_factor - return number of MRs required for a payload * @device: device handling the connection * @port_num: port num to which the connection is bound * @maxpages: maximum payload pages per rdma_rw_ctx * * Returns the number of MRs the device requires to move @maxpayload * bytes. The returned value is used during transport creation to * compute max_rdma_ctxts and the size of the transport's Send and * Send Completion Queues. */ unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num, unsigned int maxpages) { unsigned int mr_pages; if (rdma_rw_can_use_mr(device, port_num)) mr_pages = rdma_rw_fr_page_list_len(device, false); else mr_pages = device->attrs.max_sge_rd; return DIV_ROUND_UP(maxpages, mr_pages); } EXPORT_SYMBOL(rdma_rw_mr_factor); void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) { u32 factor; WARN_ON_ONCE(attr->port_num == 0); /* * Each context needs at least one RDMA READ or WRITE WR. * * For some hardware we might need more, eventually we should ask the * HCA driver for a multiplier here. */ factor = 1; /* * If the devices needs MRs to perform RDMA READ or WRITE operations, * we'll need two additional MRs for the registrations and the * invalidation. */ if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN || rdma_rw_can_use_mr(dev, attr->port_num)) factor += 2; /* inv + reg */ attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs; /* * But maybe we were just too high in the sky and the device doesn't * even support all we need, and we'll have to live with what we get.. */ attr->cap.max_send_wr = min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr); } int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) { struct ib_device *dev = qp->pd->device; u32 nr_mrs = 0, nr_sig_mrs = 0, max_num_sg = 0; int ret = 0; if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) { nr_sig_mrs = attr->cap.max_rdma_ctxs; nr_mrs = attr->cap.max_rdma_ctxs; max_num_sg = rdma_rw_fr_page_list_len(dev, true); } else if (rdma_rw_can_use_mr(dev, attr->port_num)) { nr_mrs = attr->cap.max_rdma_ctxs; max_num_sg = rdma_rw_fr_page_list_len(dev, false); } if (nr_mrs) { ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs, IB_MR_TYPE_MEM_REG, max_num_sg, 0); if (ret) { pr_err("%s: failed to allocated %u MRs\n", __func__, nr_mrs); return ret; } } if (nr_sig_mrs) { ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs, IB_MR_TYPE_INTEGRITY, max_num_sg, max_num_sg); if (ret) { pr_err("%s: failed to allocated %u SIG MRs\n", __func__, nr_sig_mrs); goto out_free_rdma_mrs; } } return 0; out_free_rdma_mrs: ib_mr_pool_destroy(qp, &qp->rdma_mrs); return ret; } void rdma_rw_cleanup_mrs(struct ib_qp *qp) { ib_mr_pool_destroy(qp, &qp->sig_mrs); ib_mr_pool_destroy(qp, &qp->rdma_mrs); } |
552 2 264 144 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Scatterlist Cryptographic API. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2002 David S. Miller (davem@redhat.com) * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> * * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no> * and Nettle, by Niels Möller. */ #ifndef _LINUX_CRYPTO_H #define _LINUX_CRYPTO_H #include <linux/atomic.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/bug.h> #include <linux/refcount.h> #include <linux/slab.h> #include <linux/completion.h> /* * Autoloaded crypto modules should only use a prefixed name to avoid allowing * arbitrary modules to be loaded. Loading from userspace may still need the * unprefixed names, so retains those aliases as well. * This uses __MODULE_INFO directly instead of MODULE_ALIAS because pre-4.3 * gcc (e.g. avr32 toolchain) uses __LINE__ for uniqueness, and this macro * expands twice on the same line. Instead, use a separate base name for the * alias. */ #define MODULE_ALIAS_CRYPTO(name) \ __MODULE_INFO(alias, alias_userspace, name); \ __MODULE_INFO(alias, alias_crypto, "crypto-" name) /* * Algorithm masks and types. */ #define CRYPTO_ALG_TYPE_MASK 0x0000000f #define CRYPTO_ALG_TYPE_CIPHER 0x00000001 #define CRYPTO_ALG_TYPE_COMPRESS 0x00000002 #define CRYPTO_ALG_TYPE_AEAD 0x00000003 #define CRYPTO_ALG_TYPE_SKCIPHER 0x00000005 #define CRYPTO_ALG_TYPE_KPP 0x00000008 #define CRYPTO_ALG_TYPE_ACOMPRESS 0x0000000a #define CRYPTO_ALG_TYPE_SCOMPRESS 0x0000000b #define CRYPTO_ALG_TYPE_RNG 0x0000000c #define CRYPTO_ALG_TYPE_AKCIPHER 0x0000000d #define CRYPTO_ALG_TYPE_HASH 0x0000000e #define CRYPTO_ALG_TYPE_SHASH 0x0000000e #define CRYPTO_ALG_TYPE_AHASH 0x0000000f #define CRYPTO_ALG_TYPE_HASH_MASK 0x0000000e #define CRYPTO_ALG_TYPE_AHASH_MASK 0x0000000e #define CRYPTO_ALG_TYPE_ACOMPRESS_MASK 0x0000000e #define CRYPTO_ALG_LARVAL 0x00000010 #define CRYPTO_ALG_DEAD 0x00000020 #define CRYPTO_ALG_DYING 0x00000040 #define CRYPTO_ALG_ASYNC 0x00000080 /* * Set if the algorithm (or an algorithm which it uses) requires another * algorithm of the same type to handle corner cases. */ #define CRYPTO_ALG_NEED_FALLBACK 0x00000100 /* * Set if the algorithm has passed automated run-time testing. Note that * if there is no run-time testing for a given algorithm it is considered * to have passed. */ #define CRYPTO_ALG_TESTED 0x00000400 /* * Set if the algorithm is an instance that is built from templates. */ #define CRYPTO_ALG_INSTANCE 0x00000800 /* Set this bit if the algorithm provided is hardware accelerated but * not available to userspace via instruction set or so. */ #define CRYPTO_ALG_KERN_DRIVER_ONLY 0x00001000 /* * Mark a cipher as a service implementation only usable by another * cipher and never by a normal user of the kernel crypto API */ #define CRYPTO_ALG_INTERNAL 0x00002000 /* * Set if the algorithm has a ->setkey() method but can be used without * calling it first, i.e. there is a default key. */ #define CRYPTO_ALG_OPTIONAL_KEY 0x00004000 /* * Don't trigger module loading */ #define CRYPTO_NOLOAD 0x00008000 /* * The algorithm may allocate memory during request processing, i.e. during * encryption, decryption, or hashing. Users can request an algorithm with this * flag unset if they can't handle memory allocation failures. * * This flag is currently only implemented for algorithms of type "skcipher", * "aead", "ahash", "shash", and "cipher". Algorithms of other types might not * have this flag set even if they allocate memory. * * In some edge cases, algorithms can allocate memory regardless of this flag. * To avoid these cases, users must obey the following usage constraints: * skcipher: * - The IV buffer and all scatterlist elements must be aligned to the * algorithm's alignmask. * - If the data were to be divided into chunks of size * crypto_skcipher_walksize() (with any remainder going at the end), no * chunk can cross a page boundary or a scatterlist element boundary. * aead: * - The IV buffer and all scatterlist elements must be aligned to the * algorithm's alignmask. * - The first scatterlist element must contain all the associated data, * and its pages must be !PageHighMem. * - If the plaintext/ciphertext were to be divided into chunks of size * crypto_aead_walksize() (with the remainder going at the end), no chunk * can cross a page boundary or a scatterlist element boundary. * ahash: * - The result buffer must be aligned to the algorithm's alignmask. * - crypto_ahash_finup() must not be used unless the algorithm implements * ->finup() natively. */ #define CRYPTO_ALG_ALLOCATES_MEMORY 0x00010000 /* * Transform masks and values (for crt_flags). */ #define CRYPTO_TFM_NEED_KEY 0x00000001 #define CRYPTO_TFM_REQ_MASK 0x000fff00 #define CRYPTO_TFM_REQ_FORBID_WEAK_KEYS 0x00000100 #define CRYPTO_TFM_REQ_MAY_SLEEP 0x00000200 #define CRYPTO_TFM_REQ_MAY_BACKLOG 0x00000400 /* * Miscellaneous stuff. */ #define CRYPTO_MAX_ALG_NAME 128 /* * The macro CRYPTO_MINALIGN_ATTR (along with the void * type in the actual * declaration) is used to ensure that the crypto_tfm context structure is * aligned correctly for the given architecture so that there are no alignment * faults for C data types. On architectures that support non-cache coherent * DMA, such as ARM or arm64, it also takes into account the minimal alignment * that is required to ensure that the context struct member does not share any * cachelines with the rest of the struct. This is needed to ensure that cache * maintenance for non-coherent DMA (cache invalidation in particular) does not * affect data that may be accessed by the CPU concurrently. */ #define CRYPTO_MINALIGN ARCH_KMALLOC_MINALIGN #define CRYPTO_MINALIGN_ATTR __attribute__ ((__aligned__(CRYPTO_MINALIGN))) struct scatterlist; struct crypto_async_request; struct crypto_tfm; struct crypto_type; typedef struct crypto_async_request crypto_completion_data_t; typedef void (*crypto_completion_t)(struct crypto_async_request *req, int err); /** * DOC: Block Cipher Context Data Structures * * These data structures define the operating context for each block cipher * type. */ struct crypto_async_request { struct list_head list; crypto_completion_t complete; void *data; struct crypto_tfm *tfm; u32 flags; }; /** * DOC: Block Cipher Algorithm Definitions * * These data structures define modular crypto algorithm implementations, * managed via crypto_register_alg() and crypto_unregister_alg(). */ /** * struct cipher_alg - single-block symmetric ciphers definition * @cia_min_keysize: Minimum key size supported by the transformation. This is * the smallest key length supported by this transformation * algorithm. This must be set to one of the pre-defined * values as this is not hardware specific. Possible values * for this field can be found via git grep "_MIN_KEY_SIZE" * include/crypto/ * @cia_max_keysize: Maximum key size supported by the transformation. This is * the largest key length supported by this transformation * algorithm. This must be set to one of the pre-defined values * as this is not hardware specific. Possible values for this * field can be found via git grep "_MAX_KEY_SIZE" * include/crypto/ * @cia_setkey: Set key for the transformation. This function is used to either * program a supplied key into the hardware or store the key in the * transformation context for programming it later. Note that this * function does modify the transformation context. This function * can be called multiple times during the existence of the * transformation object, so one must make sure the key is properly * reprogrammed into the hardware. This function is also * responsible for checking the key length for validity. * @cia_encrypt: Encrypt a single block. This function is used to encrypt a * single block of data, which must be @cra_blocksize big. This * always operates on a full @cra_blocksize and it is not possible * to encrypt a block of smaller size. The supplied buffers must * therefore also be at least of @cra_blocksize size. Both the * input and output buffers are always aligned to @cra_alignmask. * In case either of the input or output buffer supplied by user * of the crypto API is not aligned to @cra_alignmask, the crypto * API will re-align the buffers. The re-alignment means that a * new buffer will be allocated, the data will be copied into the * new buffer, then the processing will happen on the new buffer, * then the data will be copied back into the original buffer and * finally the new buffer will be freed. In case a software * fallback was put in place in the @cra_init call, this function * might need to use the fallback if the algorithm doesn't support * all of the key sizes. In case the key was stored in * transformation context, the key might need to be re-programmed * into the hardware in this function. This function shall not * modify the transformation context, as this function may be * called in parallel with the same transformation object. * @cia_decrypt: Decrypt a single block. This is a reverse counterpart to * @cia_encrypt, and the conditions are exactly the same. * * All fields are mandatory and must be filled. */ struct cipher_alg { unsigned int cia_min_keysize; unsigned int cia_max_keysize; int (*cia_setkey)(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen); void (*cia_encrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src); void (*cia_decrypt)(struct crypto_tfm *tfm, u8 *dst, const u8 *src); }; /** * struct compress_alg - compression/decompression algorithm * @coa_compress: Compress a buffer of specified length, storing the resulting * data in the specified buffer. Return the length of the * compressed data in dlen. * @coa_decompress: Decompress the source buffer, storing the uncompressed * data in the specified buffer. The length of the data is * returned in dlen. * * All fields are mandatory. */ struct compress_alg { int (*coa_compress)(struct crypto_tfm *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen); int (*coa_decompress)(struct crypto_tfm *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen); }; #ifdef CONFIG_CRYPTO_STATS /* * struct crypto_istat_aead - statistics for AEAD algorithm * @encrypt_cnt: number of encrypt requests * @encrypt_tlen: total data size handled by encrypt requests * @decrypt_cnt: number of decrypt requests * @decrypt_tlen: total data size handled by decrypt requests * @err_cnt: number of error for AEAD requests */ struct crypto_istat_aead { atomic64_t encrypt_cnt; atomic64_t encrypt_tlen; atomic64_t decrypt_cnt; atomic64_t decrypt_tlen; atomic64_t err_cnt; }; /* * struct crypto_istat_akcipher - statistics for akcipher algorithm * @encrypt_cnt: number of encrypt requests * @encrypt_tlen: total data size handled by encrypt requests * @decrypt_cnt: number of decrypt requests * @decrypt_tlen: total data size handled by decrypt requests * @verify_cnt: number of verify operation * @sign_cnt: number of sign requests * @err_cnt: number of error for akcipher requests */ struct crypto_istat_akcipher { atomic64_t encrypt_cnt; atomic64_t encrypt_tlen; atomic64_t decrypt_cnt; atomic64_t decrypt_tlen; atomic64_t verify_cnt; atomic64_t sign_cnt; atomic64_t err_cnt; }; /* * struct crypto_istat_cipher - statistics for cipher algorithm * @encrypt_cnt: number of encrypt requests * @encrypt_tlen: total data size handled by encrypt requests * @decrypt_cnt: number of decrypt requests * @decrypt_tlen: total data size handled by decrypt requests * @err_cnt: number of error for cipher requests */ struct crypto_istat_cipher { atomic64_t encrypt_cnt; atomic64_t encrypt_tlen; atomic64_t decrypt_cnt; atomic64_t decrypt_tlen; atomic64_t err_cnt; }; /* * struct crypto_istat_compress - statistics for compress algorithm * @compress_cnt: number of compress requests * @compress_tlen: total data size handled by compress requests * @decompress_cnt: number of decompress requests * @decompress_tlen: total data size handled by decompress requests * @err_cnt: number of error for compress requests */ struct crypto_istat_compress { atomic64_t compress_cnt; atomic64_t compress_tlen; atomic64_t decompress_cnt; atomic64_t decompress_tlen; atomic64_t err_cnt; }; /* * struct crypto_istat_hash - statistics for has algorithm * @hash_cnt: number of hash requests * @hash_tlen: total data size hashed * @err_cnt: number of error for hash requests */ struct crypto_istat_hash { atomic64_t hash_cnt; atomic64_t hash_tlen; atomic64_t err_cnt; }; /* * struct crypto_istat_kpp - statistics for KPP algorithm * @setsecret_cnt: number of setsecrey operation * @generate_public_key_cnt: number of generate_public_key operation * @compute_shared_secret_cnt: number of compute_shared_secret operation * @err_cnt: number of error for KPP requests */ struct crypto_istat_kpp { atomic64_t setsecret_cnt; atomic64_t generate_public_key_cnt; atomic64_t compute_shared_secret_cnt; atomic64_t err_cnt; }; /* * struct crypto_istat_rng: statistics for RNG algorithm * @generate_cnt: number of RNG generate requests * @generate_tlen: total data size of generated data by the RNG * @seed_cnt: number of times the RNG was seeded * @err_cnt: number of error for RNG requests */ struct crypto_istat_rng { atomic64_t generate_cnt; atomic64_t generate_tlen; atomic64_t seed_cnt; atomic64_t err_cnt; }; #endif /* CONFIG_CRYPTO_STATS */ #define cra_cipher cra_u.cipher #define cra_compress cra_u.compress /** * struct crypto_alg - definition of a cryptograpic cipher algorithm * @cra_flags: Flags describing this transformation. See include/linux/crypto.h * CRYPTO_ALG_* flags for the flags which go in here. Those are * used for fine-tuning the description of the transformation * algorithm. * @cra_blocksize: Minimum block size of this transformation. The size in bytes * of the smallest possible unit which can be transformed with * this algorithm. The users must respect this value. * In case of HASH transformation, it is possible for a smaller * block than @cra_blocksize to be passed to the crypto API for * transformation, in case of any other transformation type, an * error will be returned upon any attempt to transform smaller * than @cra_blocksize chunks. * @cra_ctxsize: Size of the operational context of the transformation. This * value informs the kernel crypto API about the memory size * needed to be allocated for the transformation context. * @cra_alignmask: Alignment mask for the input and output data buffer. The data * buffer containing the input data for the algorithm must be * aligned to this alignment mask. The data buffer for the * output data must be aligned to this alignment mask. Note that * the Crypto API will do the re-alignment in software, but * only under special conditions and there is a performance hit. * The re-alignment happens at these occasions for different * @cra_u types: cipher -- For both input data and output data * buffer; ahash -- For output hash destination buf; shash -- * For output hash destination buf. * This is needed on hardware which is flawed by design and * cannot pick data from arbitrary addresses. * @cra_priority: Priority of this transformation implementation. In case * multiple transformations with same @cra_name are available to * the Crypto API, the kernel will use the one with highest * @cra_priority. * @cra_name: Generic name (usable by multiple implementations) of the * transformation algorithm. This is the name of the transformation * itself. This field is used by the kernel when looking up the * providers of particular transformation. * @cra_driver_name: Unique name of the transformation provider. This is the * name of the provider of the transformation. This can be any * arbitrary value, but in the usual case, this contains the * name of the chip or provider and the name of the * transformation algorithm. * @cra_type: Type of the cryptographic transformation. This is a pointer to * struct crypto_type, which implements callbacks common for all * transformation types. There are multiple options, such as * &crypto_skcipher_type, &crypto_ahash_type, &crypto_rng_type. * This field might be empty. In that case, there are no common * callbacks. This is the case for: cipher, compress, shash. * @cra_u: Callbacks implementing the transformation. This is a union of * multiple structures. Depending on the type of transformation selected * by @cra_type and @cra_flags above, the associated structure must be * filled with callbacks. This field might be empty. This is the case * for ahash, shash. * @cra_init: Initialize the cryptographic transformation object. This function * is used to initialize the cryptographic transformation object. * This function is called only once at the instantiation time, right * after the transformation context was allocated. In case the * cryptographic hardware has some special requirements which need to * be handled by software, this function shall check for the precise * requirement of the transformation and put any software fallbacks * in place. * @cra_exit: Deinitialize the cryptographic transformation object. This is a * counterpart to @cra_init, used to remove various changes set in * @cra_init. * @cra_u.cipher: Union member which contains a single-block symmetric cipher * definition. See @struct @cipher_alg. * @cra_u.compress: Union member which contains a (de)compression algorithm. * See @struct @compress_alg. * @cra_module: Owner of this transformation implementation. Set to THIS_MODULE * @cra_list: internally used * @cra_users: internally used * @cra_refcnt: internally used * @cra_destroy: internally used * * @stats: union of all possible crypto_istat_xxx structures * @stats.aead: statistics for AEAD algorithm * @stats.akcipher: statistics for akcipher algorithm * @stats.cipher: statistics for cipher algorithm * @stats.compress: statistics for compress algorithm * @stats.hash: statistics for hash algorithm * @stats.rng: statistics for rng algorithm * @stats.kpp: statistics for KPP algorithm * * The struct crypto_alg describes a generic Crypto API algorithm and is common * for all of the transformations. Any variable not documented here shall not * be used by a cipher implementation as it is internal to the Crypto API. */ struct crypto_alg { struct list_head cra_list; struct list_head cra_users; u32 cra_flags; unsigned int cra_blocksize; unsigned int cra_ctxsize; unsigned int cra_alignmask; int cra_priority; refcount_t cra_refcnt; char cra_name[CRYPTO_MAX_ALG_NAME]; char cra_driver_name[CRYPTO_MAX_ALG_NAME]; const struct crypto_type *cra_type; union { struct cipher_alg cipher; struct compress_alg compress; } cra_u; int (*cra_init)(struct crypto_tfm *tfm); void (*cra_exit)(struct crypto_tfm *tfm); void (*cra_destroy)(struct crypto_alg *alg); struct module *cra_module; #ifdef CONFIG_CRYPTO_STATS union { struct crypto_istat_aead aead; struct crypto_istat_akcipher akcipher; struct crypto_istat_cipher cipher; struct crypto_istat_compress compress; struct crypto_istat_hash hash; struct crypto_istat_rng rng; struct crypto_istat_kpp kpp; } stats; #endif /* CONFIG_CRYPTO_STATS */ } CRYPTO_MINALIGN_ATTR; #ifdef CONFIG_CRYPTO_STATS void crypto_stats_init(struct crypto_alg *alg); void crypto_stats_get(struct crypto_alg *alg); void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret); void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret); void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg); void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg); void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, struct crypto_alg *alg); void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret, struct crypto_alg *alg); void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg); void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg); void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg); void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg); void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret); void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret); void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret); void crypto_stats_rng_seed(struct crypto_alg *alg, int ret); void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret); void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg); void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg); #else static inline void crypto_stats_init(struct crypto_alg *alg) {} static inline void crypto_stats_get(struct crypto_alg *alg) {} static inline void crypto_stats_aead_encrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret) {} static inline void crypto_stats_aead_decrypt(unsigned int cryptlen, struct crypto_alg *alg, int ret) {} static inline void crypto_stats_ahash_update(unsigned int nbytes, int ret, struct crypto_alg *alg) {} static inline void crypto_stats_ahash_final(unsigned int nbytes, int ret, struct crypto_alg *alg) {} static inline void crypto_stats_akcipher_encrypt(unsigned int src_len, int ret, struct crypto_alg *alg) {} static inline void crypto_stats_akcipher_decrypt(unsigned int src_len, int ret, struct crypto_alg *alg) {} static inline void crypto_stats_akcipher_sign(int ret, struct crypto_alg *alg) {} static inline void crypto_stats_akcipher_verify(int ret, struct crypto_alg *alg) {} static inline void crypto_stats_compress(unsigned int slen, int ret, struct crypto_alg *alg) {} static inline void crypto_stats_decompress(unsigned int slen, int ret, struct crypto_alg *alg) {} static inline void crypto_stats_kpp_set_secret(struct crypto_alg *alg, int ret) {} static inline void crypto_stats_kpp_generate_public_key(struct crypto_alg *alg, int ret) {} static inline void crypto_stats_kpp_compute_shared_secret(struct crypto_alg *alg, int ret) {} static inline void crypto_stats_rng_seed(struct crypto_alg *alg, int ret) {} static inline void crypto_stats_rng_generate(struct crypto_alg *alg, unsigned int dlen, int ret) {} static inline void crypto_stats_skcipher_encrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg) {} static inline void crypto_stats_skcipher_decrypt(unsigned int cryptlen, int ret, struct crypto_alg *alg) {} #endif /* * A helper struct for waiting for completion of async crypto ops */ struct crypto_wait { struct completion completion; int err; }; /* * Macro for declaring a crypto op async wait object on stack */ #define DECLARE_CRYPTO_WAIT(_wait) \ struct crypto_wait _wait = { \ COMPLETION_INITIALIZER_ONSTACK((_wait).completion), 0 } /* * Async ops completion helper functioons */ static inline void *crypto_get_completion_data(crypto_completion_data_t *req) { return req->data; } void crypto_req_done(struct crypto_async_request *req, int err); static inline int crypto_wait_req(int err, struct crypto_wait *wait) { switch (err) { case -EINPROGRESS: case -EBUSY: wait_for_completion(&wait->completion); reinit_completion(&wait->completion); err = wait->err; break; } return err; } static inline void crypto_init_wait(struct crypto_wait *wait) { init_completion(&wait->completion); } /* * Algorithm registration interface. */ int crypto_register_alg(struct crypto_alg *alg); void crypto_unregister_alg(struct crypto_alg *alg); int crypto_register_algs(struct crypto_alg *algs, int count); void crypto_unregister_algs(struct crypto_alg *algs, int count); /* * Algorithm query interface. */ int crypto_has_alg(const char *name, u32 type, u32 mask); /* * Transforms: user-instantiated objects which encapsulate algorithms * and core processing logic. Managed via crypto_alloc_*() and * crypto_free_*(), as well as the various helpers below. */ struct crypto_tfm { u32 crt_flags; int node; void (*exit)(struct crypto_tfm *tfm); struct crypto_alg *__crt_alg; void *__crt_ctx[] CRYPTO_MINALIGN_ATTR; }; struct crypto_comp { struct crypto_tfm base; }; /* * Transform user interface. */ struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask); void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm); static inline void crypto_free_tfm(struct crypto_tfm *tfm) { return crypto_destroy_tfm(tfm, tfm); } int alg_test(const char *driver, const char *alg, u32 type, u32 mask); /* * Transform helpers which query the underlying algorithm. */ static inline const char *crypto_tfm_alg_name(struct crypto_tfm *tfm) { return tfm->__crt_alg->cra_name; } static inline const char *crypto_tfm_alg_driver_name(struct crypto_tfm *tfm) { return tfm->__crt_alg->cra_driver_name; } static inline int crypto_tfm_alg_priority(struct crypto_tfm *tfm) { return tfm->__crt_alg->cra_priority; } static inline u32 crypto_tfm_alg_type(struct crypto_tfm *tfm) { return tfm->__crt_alg->cra_flags & CRYPTO_ALG_TYPE_MASK; } static inline unsigned int crypto_tfm_alg_blocksize(struct crypto_tfm *tfm) { return tfm->__crt_alg->cra_blocksize; } static inline unsigned int crypto_tfm_alg_alignmask(struct crypto_tfm *tfm) { return tfm->__crt_alg->cra_alignmask; } static inline u32 crypto_tfm_get_flags(struct crypto_tfm *tfm) { return tfm->crt_flags; } static inline void crypto_tfm_set_flags(struct crypto_tfm *tfm, u32 flags) { tfm->crt_flags |= flags; } static inline void crypto_tfm_clear_flags(struct crypto_tfm *tfm, u32 flags) { tfm->crt_flags &= ~flags; } static inline void *crypto_tfm_ctx(struct crypto_tfm *tfm) { return tfm->__crt_ctx; } static inline unsigned int crypto_tfm_ctx_alignment(void) { struct crypto_tfm *tfm; return __alignof__(tfm->__crt_ctx); } static inline struct crypto_comp *__crypto_comp_cast(struct crypto_tfm *tfm) { return (struct crypto_comp *)tfm; } static inline struct crypto_comp *crypto_alloc_comp(const char *alg_name, u32 type, u32 mask) { type &= ~CRYPTO_ALG_TYPE_MASK; type |= CRYPTO_ALG_TYPE_COMPRESS; mask |= CRYPTO_ALG_TYPE_MASK; return __crypto_comp_cast(crypto_alloc_base(alg_name, type, mask)); } static inline struct crypto_tfm *crypto_comp_tfm(struct crypto_comp *tfm) { return &tfm->base; } static inline void crypto_free_comp(struct crypto_comp *tfm) { crypto_free_tfm(crypto_comp_tfm(tfm)); } static inline int crypto_has_comp(const char *alg_name, u32 type, u32 mask) { type &= ~CRYPTO_ALG_TYPE_MASK; type |= CRYPTO_ALG_TYPE_COMPRESS; mask |= CRYPTO_ALG_TYPE_MASK; return crypto_has_alg(alg_name, type, mask); } static inline const char *crypto_comp_name(struct crypto_comp *tfm) { return crypto_tfm_alg_name(crypto_comp_tfm(tfm)); } int crypto_comp_compress(struct crypto_comp *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen); int crypto_comp_decompress(struct crypto_comp *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen); #endif /* _LINUX_CRYPTO_H */ |
359 7 7 42 50 46 38 4 39 42 42 42 12 1 4 2 34 34 30 14 34 34 34 2 28 4 17 17 8 2 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 | // SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * */ #include <linux/fs.h> #include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" /* * al_is_valid_le * * Return: True if @le is valid. */ static inline bool al_is_valid_le(const struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le) { if (!le || !ni->attr_list.le || !ni->attr_list.size) return false; return PtrOffset(ni->attr_list.le, le) + le16_to_cpu(le->size) <= ni->attr_list.size; } void al_destroy(struct ntfs_inode *ni) { run_close(&ni->attr_list.run); kvfree(ni->attr_list.le); ni->attr_list.le = NULL; ni->attr_list.size = 0; ni->attr_list.dirty = false; } /* * ntfs_load_attr_list * * This method makes sure that the ATTRIB list, if present, * has been properly set up. */ int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr) { int err; size_t lsize; void *le = NULL; if (ni->attr_list.size) return 0; if (!attr->non_res) { lsize = le32_to_cpu(attr->res.data_size); /* attr is resident: lsize < record_size (1K or 4K) */ le = kvmalloc(al_aligned(lsize), GFP_KERNEL); if (!le) { err = -ENOMEM; goto out; } memcpy(le, resident_data(attr), lsize); } else if (attr->nres.svcn) { err = -EINVAL; goto out; } else { u16 run_off = le16_to_cpu(attr->nres.run_off); lsize = le64_to_cpu(attr->nres.data_size); run_init(&ni->attr_list.run); if (run_off > le32_to_cpu(attr->size)) { err = -EINVAL; goto out; } err = run_unpack_ex(&ni->attr_list.run, ni->mi.sbi, ni->mi.rno, 0, le64_to_cpu(attr->nres.evcn), 0, Add2Ptr(attr, run_off), le32_to_cpu(attr->size) - run_off); if (err < 0) goto out; /* attr is nonresident. * The worst case: * 1T (2^40) extremely fragmented file. * cluster = 4K (2^12) => 2^28 fragments * 2^9 fragments per one record => 2^19 records * 2^5 bytes of ATTR_LIST_ENTRY per one record => 2^24 bytes. * * the result is 16M bytes per attribute list. * Use kvmalloc to allocate in range [several Kbytes - dozen Mbytes] */ le = kvmalloc(al_aligned(lsize), GFP_KERNEL); if (!le) { err = -ENOMEM; goto out; } err = ntfs_read_run_nb(ni->mi.sbi, &ni->attr_list.run, 0, le, lsize, NULL); if (err) goto out; } ni->attr_list.size = lsize; ni->attr_list.le = le; return 0; out: ni->attr_list.le = le; al_destroy(ni); return err; } /* * al_enumerate * * Return: * * The next list le. * * If @le is NULL then return the first le. */ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le) { size_t off; u16 sz; const unsigned le_min_size = le_size(0); if (!le) { le = ni->attr_list.le; } else { sz = le16_to_cpu(le->size); if (sz < le_min_size) { /* Impossible 'cause we should not return such le. */ return NULL; } le = Add2Ptr(le, sz); } /* Check boundary. */ off = PtrOffset(ni->attr_list.le, le); if (off + le_min_size > ni->attr_list.size) { /* The regular end of list. */ return NULL; } sz = le16_to_cpu(le->size); /* Check le for errors. */ if (sz < le_min_size || off + sz > ni->attr_list.size || sz < le->name_off + le->name_len * sizeof(short)) { return NULL; } return le; } /* * al_find_le * * Find the first le in the list which matches type, name and VCN. * * Return: NULL if not found. */ struct ATTR_LIST_ENTRY *al_find_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, const struct ATTRIB *attr) { CLST svcn = attr_svcn(attr); return al_find_ex(ni, le, attr->type, attr_name(attr), attr->name_len, &svcn); } /* * al_find_ex * * Find the first le in the list which matches type, name and VCN. * * Return: NULL if not found. */ struct ATTR_LIST_ENTRY *al_find_ex(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, enum ATTR_TYPE type, const __le16 *name, u8 name_len, const CLST *vcn) { struct ATTR_LIST_ENTRY *ret = NULL; u32 type_in = le32_to_cpu(type); while ((le = al_enumerate(ni, le))) { u64 le_vcn; int diff = le32_to_cpu(le->type) - type_in; /* List entries are sorted by type, name and VCN. */ if (diff < 0) continue; if (diff > 0) return ret; if (le->name_len != name_len) continue; le_vcn = le64_to_cpu(le->vcn); if (!le_vcn) { /* * Compare entry names only for entry with vcn == 0. */ diff = ntfs_cmp_names(le_name(le), name_len, name, name_len, ni->mi.sbi->upcase, true); if (diff < 0) continue; if (diff > 0) return ret; } if (!vcn) return le; if (*vcn == le_vcn) return le; if (*vcn < le_vcn) return ret; ret = le; } return ret; } /* * al_find_le_to_insert * * Find the first list entry which matches type, name and VCN. */ static struct ATTR_LIST_ENTRY *al_find_le_to_insert(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, CLST vcn) { struct ATTR_LIST_ENTRY *le = NULL, *prev; u32 type_in = le32_to_cpu(type); /* List entries are sorted by type, name and VCN. */ while ((le = al_enumerate(ni, prev = le))) { int diff = le32_to_cpu(le->type) - type_in; if (diff < 0) continue; if (diff > 0) return le; if (!le->vcn) { /* * Compare entry names only for entry with vcn == 0. */ diff = ntfs_cmp_names(le_name(le), le->name_len, name, name_len, ni->mi.sbi->upcase, true); if (diff < 0) continue; if (diff > 0) return le; } if (le64_to_cpu(le->vcn) >= vcn) return le; } return prev ? Add2Ptr(prev, le16_to_cpu(prev->size)) : ni->attr_list.le; } /* * al_add_le * * Add an "attribute list entry" to the list. */ int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, CLST svcn, __le16 id, const struct MFT_REF *ref, struct ATTR_LIST_ENTRY **new_le) { int err; struct ATTRIB *attr; struct ATTR_LIST_ENTRY *le; size_t off; u16 sz; size_t asize, new_asize, old_size; u64 new_size; typeof(ni->attr_list) *al = &ni->attr_list; /* * Compute the size of the new 'le' */ sz = le_size(name_len); old_size = al->size; new_size = old_size + sz; asize = al_aligned(old_size); new_asize = al_aligned(new_size); /* Scan forward to the point at which the new 'le' should be inserted. */ le = al_find_le_to_insert(ni, type, name, name_len, svcn); off = PtrOffset(al->le, le); if (new_size > asize) { void *ptr = kmalloc(new_asize, GFP_NOFS); if (!ptr) return -ENOMEM; memcpy(ptr, al->le, off); memcpy(Add2Ptr(ptr, off + sz), le, old_size - off); le = Add2Ptr(ptr, off); kvfree(al->le); al->le = ptr; } else { memmove(Add2Ptr(le, sz), le, old_size - off); } *new_le = le; al->size = new_size; le->type = type; le->size = cpu_to_le16(sz); le->name_len = name_len; le->name_off = offsetof(struct ATTR_LIST_ENTRY, name); le->vcn = cpu_to_le64(svcn); le->ref = *ref; le->id = id; memcpy(le->name, name, sizeof(short) * name_len); err = attr_set_size(ni, ATTR_LIST, NULL, 0, &al->run, new_size, &new_size, true, &attr); if (err) { /* Undo memmove above. */ memmove(le, Add2Ptr(le, sz), old_size - off); al->size = old_size; return err; } al->dirty = true; if (attr && attr->non_res) { err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le, al->size, 0); if (err) return err; al->dirty = false; } return 0; } /* * al_remove_le - Remove @le from attribute list. */ bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le) { u16 size; size_t off; typeof(ni->attr_list) *al = &ni->attr_list; if (!al_is_valid_le(ni, le)) return false; /* Save on stack the size of 'le' */ size = le16_to_cpu(le->size); off = PtrOffset(al->le, le); memmove(le, Add2Ptr(le, size), al->size - (off + size)); al->size -= size; al->dirty = true; return true; } /* * al_delete_le - Delete first le from the list which matches its parameters. */ bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, const __le16 *name, size_t name_len, const struct MFT_REF *ref) { u16 size; struct ATTR_LIST_ENTRY *le; size_t off; typeof(ni->attr_list) *al = &ni->attr_list; /* Scan forward to the first le that matches the input. */ le = al_find_ex(ni, NULL, type, name, name_len, &vcn); if (!le) return false; off = PtrOffset(al->le, le); next: if (off >= al->size) return false; if (le->type != type) return false; if (le->name_len != name_len) return false; if (name_len && ntfs_cmp_names(le_name(le), name_len, name, name_len, ni->mi.sbi->upcase, true)) return false; if (le64_to_cpu(le->vcn) != vcn) return false; /* * The caller specified a segment reference, so we have to * scan through the matching entries until we find that segment * reference or we run of matching entries. */ if (ref && memcmp(ref, &le->ref, sizeof(*ref))) { off += le16_to_cpu(le->size); le = Add2Ptr(al->le, off); goto next; } /* Save on stack the size of 'le'. */ size = le16_to_cpu(le->size); /* Delete the le. */ memmove(le, Add2Ptr(le, size), al->size - (off + size)); al->size -= size; al->dirty = true; return true; } int al_update(struct ntfs_inode *ni, int sync) { int err; struct ATTRIB *attr; typeof(ni->attr_list) *al = &ni->attr_list; if (!al->dirty || !al->size) return 0; /* * Attribute list increased on demand in al_add_le. * Attribute list decreased here. */ err = attr_set_size(ni, ATTR_LIST, NULL, 0, &al->run, al->size, NULL, false, &attr); if (err) goto out; if (!attr->non_res) { memcpy(resident_data(attr), al->le, al->size); } else { err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le, al->size, sync); if (err) goto out; attr->nres.valid_size = attr->nres.data_size; } ni->mi.dirty = true; al->dirty = false; out: return err; } |
7 3 7 1 3 3 4 2 5 1 5 1 6 5 1 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 | // SPDX-License-Identifier: GPL-2.0-or-later /* * inode.c - basic inode and dentry operations. * * Based on sysfs: * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel * * configfs Copyright (C) 2005 Oracle. All rights reserved. * * Please see Documentation/filesystems/configfs.rst for more * information. */ #undef DEBUG #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/backing-dev.h> #include <linux/capability.h> #include <linux/sched.h> #include <linux/lockdep.h> #include <linux/slab.h> #include <linux/configfs.h> #include "configfs_internal.h" #ifdef CONFIG_LOCKDEP static struct lock_class_key default_group_class[MAX_LOCK_DEPTH]; #endif static const struct inode_operations configfs_inode_operations ={ .setattr = configfs_setattr, }; int configfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *iattr) { struct inode * inode = d_inode(dentry); struct configfs_dirent * sd = dentry->d_fsdata; struct iattr * sd_iattr; unsigned int ia_valid = iattr->ia_valid; int error; if (!sd) return -EINVAL; sd_iattr = sd->s_iattr; if (!sd_iattr) { /* setting attributes for the first time, allocate now */ sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); if (!sd_iattr) return -ENOMEM; /* assign default attributes */ sd_iattr->ia_mode = sd->s_mode; sd_iattr->ia_uid = GLOBAL_ROOT_UID; sd_iattr->ia_gid = GLOBAL_ROOT_GID; sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = current_time(inode); sd->s_iattr = sd_iattr; } /* attributes were changed atleast once in past */ error = simple_setattr(mnt_userns, dentry, iattr); if (error) return error; if (ia_valid & ATTR_UID) sd_iattr->ia_uid = iattr->ia_uid; if (ia_valid & ATTR_GID) sd_iattr->ia_gid = iattr->ia_gid; if (ia_valid & ATTR_ATIME) sd_iattr->ia_atime = iattr->ia_atime; if (ia_valid & ATTR_MTIME) sd_iattr->ia_mtime = iattr->ia_mtime; if (ia_valid & ATTR_CTIME) sd_iattr->ia_ctime = iattr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = iattr->ia_mode; if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; sd_iattr->ia_mode = sd->s_mode = mode; } return error; } static inline void set_default_inode_attr(struct inode * inode, umode_t mode) { inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); } static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) { inode->i_mode = iattr->ia_mode; inode->i_uid = iattr->ia_uid; inode->i_gid = iattr->ia_gid; inode->i_atime = iattr->ia_atime; inode->i_mtime = iattr->ia_mtime; inode->i_ctime = iattr->ia_ctime; } struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd, struct super_block *s) { struct inode * inode = new_inode(s); if (inode) { inode->i_ino = get_next_ino(); inode->i_mapping->a_ops = &ram_aops; inode->i_op = &configfs_inode_operations; if (sd->s_iattr) { /* sysfs_dirent has non-default attributes * get them for the new inode from persistent copy * in sysfs_dirent */ set_inode_attr(inode, sd->s_iattr); } else set_default_inode_attr(inode, mode); } return inode; } #ifdef CONFIG_LOCKDEP static void configfs_set_inode_lock_class(struct configfs_dirent *sd, struct inode *inode) { int depth = sd->s_depth; if (depth > 0) { if (depth <= ARRAY_SIZE(default_group_class)) { lockdep_set_class(&inode->i_rwsem, &default_group_class[depth - 1]); } else { /* * In practice the maximum level of locking depth is * already reached. Just inform about possible reasons. */ pr_info("Too many levels of inodes for the locking correctness validator.\n"); pr_info("Spurious warnings may appear.\n"); } } } #else /* CONFIG_LOCKDEP */ static void configfs_set_inode_lock_class(struct configfs_dirent *sd, struct inode *inode) { } #endif /* CONFIG_LOCKDEP */ struct inode *configfs_create(struct dentry *dentry, umode_t mode) { struct inode *inode = NULL; struct configfs_dirent *sd; struct inode *p_inode; if (!dentry) return ERR_PTR(-ENOENT); if (d_really_is_positive(dentry)) return ERR_PTR(-EEXIST); sd = dentry->d_fsdata; inode = configfs_new_inode(mode, sd, dentry->d_sb); if (!inode) return ERR_PTR(-ENOMEM); p_inode = d_inode(dentry->d_parent); p_inode->i_mtime = p_inode->i_ctime = current_time(p_inode); configfs_set_inode_lock_class(sd, inode); return inode; } /* * Get the name for corresponding element represented by the given configfs_dirent */ const unsigned char * configfs_get_name(struct configfs_dirent *sd) { struct configfs_attribute *attr; BUG_ON(!sd || !sd->s_element); /* These always have a dentry, so use that */ if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK)) return sd->s_dentry->d_name.name; if (sd->s_type & (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR)) { attr = sd->s_element; return attr->ca_name; } return NULL; } /* * Unhashes the dentry corresponding to given configfs_dirent * Called with parent inode's i_mutex held. */ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) { struct dentry * dentry = sd->s_dentry; if (dentry) { spin_lock(&dentry->d_lock); if (simple_positive(dentry)) { dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); simple_unlink(d_inode(parent), dentry); } else spin_unlock(&dentry->d_lock); } } void configfs_hash_and_remove(struct dentry * dir, const char * name) { struct configfs_dirent * sd; struct configfs_dirent * parent_sd = dir->d_fsdata; if (d_really_is_negative(dir)) /* no inode means this hasn't been made visible yet */ return; inode_lock(d_inode(dir)); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) continue; if (!strcmp(configfs_get_name(sd), name)) { spin_lock(&configfs_dirent_lock); list_del_init(&sd->s_sibling); spin_unlock(&configfs_dirent_lock); configfs_drop_dentry(sd, dir); configfs_put(sd); break; } } inode_unlock(d_inode(dir)); } |
130 62 2 111 77 111 77 84 40 4 1 1 34 34 34 34 38 3 35 38 74 40 40 34 34 16 130 104 107 104 2 20 35 6 61 39 5 58 44 47 47 27 25 20 8 35 26 27 48 48 3 27 16 6 22 6 17 22 34 34 74 5 34 25 25 18 1 1 1 2 13 12 14 3 1 21 26 27 19 25 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 | // SPDX-License-Identifier: GPL-2.0-or-later /* * OSS compatible sequencer driver * * MIDI device handlers * * Copyright (C) 1998,99 Takashi Iwai <tiwai@suse.de> */ #include <sound/asoundef.h> #include "seq_oss_midi.h" #include "seq_oss_readq.h" #include "seq_oss_timer.h" #include "seq_oss_event.h" #include <sound/seq_midi_event.h> #include "../seq_lock.h" #include <linux/init.h> #include <linux/slab.h> #include <linux/nospec.h> /* * constants */ #define SNDRV_SEQ_OSS_MAX_MIDI_NAME 30 /* * definition of midi device record */ struct seq_oss_midi { int seq_device; /* device number */ int client; /* sequencer client number */ int port; /* sequencer port number */ unsigned int flags; /* port capability */ int opened; /* flag for opening */ unsigned char name[SNDRV_SEQ_OSS_MAX_MIDI_NAME]; struct snd_midi_event *coder; /* MIDI event coder */ struct seq_oss_devinfo *devinfo; /* assigned OSSseq device */ snd_use_lock_t use_lock; struct mutex open_mutex; }; /* * midi device table */ static int max_midi_devs; static struct seq_oss_midi *midi_devs[SNDRV_SEQ_OSS_MAX_MIDI_DEVS]; static DEFINE_SPINLOCK(register_lock); /* * prototypes */ static struct seq_oss_midi *get_mdev(int dev); static struct seq_oss_midi *get_mididev(struct seq_oss_devinfo *dp, int dev); static int send_synth_event(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, int dev); static int send_midi_event(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, struct seq_oss_midi *mdev); /* * look up the existing ports * this looks a very exhausting job. */ int snd_seq_oss_midi_lookup_ports(int client) { struct snd_seq_client_info *clinfo; struct snd_seq_port_info *pinfo; clinfo = kzalloc(sizeof(*clinfo), GFP_KERNEL); pinfo = kzalloc(sizeof(*pinfo), GFP_KERNEL); if (! clinfo || ! pinfo) { kfree(clinfo); kfree(pinfo); return -ENOMEM; } clinfo->client = -1; while (snd_seq_kernel_client_ctl(client, SNDRV_SEQ_IOCTL_QUERY_NEXT_CLIENT, clinfo) == 0) { if (clinfo->client == client) continue; /* ignore myself */ pinfo->addr.client = clinfo->client; pinfo->addr.port = -1; while (snd_seq_kernel_client_ctl(client, SNDRV_SEQ_IOCTL_QUERY_NEXT_PORT, pinfo) == 0) snd_seq_oss_midi_check_new_port(pinfo); } kfree(clinfo); kfree(pinfo); return 0; } /* */ static struct seq_oss_midi * get_mdev(int dev) { struct seq_oss_midi *mdev; unsigned long flags; spin_lock_irqsave(®ister_lock, flags); mdev = midi_devs[dev]; if (mdev) snd_use_lock_use(&mdev->use_lock); spin_unlock_irqrestore(®ister_lock, flags); return mdev; } /* * look for the identical slot */ static struct seq_oss_midi * find_slot(int client, int port) { int i; struct seq_oss_midi *mdev; unsigned long flags; spin_lock_irqsave(®ister_lock, flags); for (i = 0; i < max_midi_devs; i++) { mdev = midi_devs[i]; if (mdev && mdev->client == client && mdev->port == port) { /* found! */ snd_use_lock_use(&mdev->use_lock); spin_unlock_irqrestore(®ister_lock, flags); return mdev; } } spin_unlock_irqrestore(®ister_lock, flags); return NULL; } #define PERM_WRITE (SNDRV_SEQ_PORT_CAP_WRITE|SNDRV_SEQ_PORT_CAP_SUBS_WRITE) #define PERM_READ (SNDRV_SEQ_PORT_CAP_READ|SNDRV_SEQ_PORT_CAP_SUBS_READ) /* * register a new port if it doesn't exist yet */ int snd_seq_oss_midi_check_new_port(struct snd_seq_port_info *pinfo) { int i; struct seq_oss_midi *mdev; unsigned long flags; /* the port must include generic midi */ if (! (pinfo->type & SNDRV_SEQ_PORT_TYPE_MIDI_GENERIC)) return 0; /* either read or write subscribable */ if ((pinfo->capability & PERM_WRITE) != PERM_WRITE && (pinfo->capability & PERM_READ) != PERM_READ) return 0; /* * look for the identical slot */ mdev = find_slot(pinfo->addr.client, pinfo->addr.port); if (mdev) { /* already exists */ snd_use_lock_free(&mdev->use_lock); return 0; } /* * allocate midi info record */ mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) return -ENOMEM; /* copy the port information */ mdev->client = pinfo->addr.client; mdev->port = pinfo->addr.port; mdev->flags = pinfo->capability; mdev->opened = 0; snd_use_lock_init(&mdev->use_lock); mutex_init(&mdev->open_mutex); /* copy and truncate the name of synth device */ strscpy(mdev->name, pinfo->name, sizeof(mdev->name)); /* create MIDI coder */ if (snd_midi_event_new(MAX_MIDI_EVENT_BUF, &mdev->coder) < 0) { pr_err("ALSA: seq_oss: can't malloc midi coder\n"); kfree(mdev); return -ENOMEM; } /* OSS sequencer adds running status to all sequences */ snd_midi_event_no_status(mdev->coder, 1); /* * look for en empty slot */ spin_lock_irqsave(®ister_lock, flags); for (i = 0; i < max_midi_devs; i++) { if (midi_devs[i] == NULL) break; } if (i >= max_midi_devs) { if (max_midi_devs >= SNDRV_SEQ_OSS_MAX_MIDI_DEVS) { spin_unlock_irqrestore(®ister_lock, flags); snd_midi_event_free(mdev->coder); kfree(mdev); return -ENOMEM; } max_midi_devs++; } mdev->seq_device = i; midi_devs[mdev->seq_device] = mdev; spin_unlock_irqrestore(®ister_lock, flags); return 0; } /* * release the midi device if it was registered */ int snd_seq_oss_midi_check_exit_port(int client, int port) { struct seq_oss_midi *mdev; unsigned long flags; int index; mdev = find_slot(client, port); if (mdev) { spin_lock_irqsave(®ister_lock, flags); midi_devs[mdev->seq_device] = NULL; spin_unlock_irqrestore(®ister_lock, flags); snd_use_lock_free(&mdev->use_lock); snd_use_lock_sync(&mdev->use_lock); snd_midi_event_free(mdev->coder); kfree(mdev); } spin_lock_irqsave(®ister_lock, flags); for (index = max_midi_devs - 1; index >= 0; index--) { if (midi_devs[index]) break; } max_midi_devs = index + 1; spin_unlock_irqrestore(®ister_lock, flags); return 0; } /* * release the midi device if it was registered */ void snd_seq_oss_midi_clear_all(void) { int i; struct seq_oss_midi *mdev; unsigned long flags; spin_lock_irqsave(®ister_lock, flags); for (i = 0; i < max_midi_devs; i++) { mdev = midi_devs[i]; if (mdev) { snd_midi_event_free(mdev->coder); kfree(mdev); midi_devs[i] = NULL; } } max_midi_devs = 0; spin_unlock_irqrestore(®ister_lock, flags); } /* * set up midi tables */ void snd_seq_oss_midi_setup(struct seq_oss_devinfo *dp) { spin_lock_irq(®ister_lock); dp->max_mididev = max_midi_devs; spin_unlock_irq(®ister_lock); } /* * clean up midi tables */ void snd_seq_oss_midi_cleanup(struct seq_oss_devinfo *dp) { int i; for (i = 0; i < dp->max_mididev; i++) snd_seq_oss_midi_close(dp, i); dp->max_mididev = 0; } /* * open all midi devices. ignore errors. */ void snd_seq_oss_midi_open_all(struct seq_oss_devinfo *dp, int file_mode) { int i; for (i = 0; i < dp->max_mididev; i++) snd_seq_oss_midi_open(dp, i, file_mode); } /* * get the midi device information */ static struct seq_oss_midi * get_mididev(struct seq_oss_devinfo *dp, int dev) { if (dev < 0 || dev >= dp->max_mididev) return NULL; dev = array_index_nospec(dev, dp->max_mididev); return get_mdev(dev); } /* * open the midi device if not opened yet */ int snd_seq_oss_midi_open(struct seq_oss_devinfo *dp, int dev, int fmode) { int perm; struct seq_oss_midi *mdev; struct snd_seq_port_subscribe subs; int err; mdev = get_mididev(dp, dev); if (!mdev) return -ENODEV; mutex_lock(&mdev->open_mutex); /* already used? */ if (mdev->opened && mdev->devinfo != dp) { err = -EBUSY; goto unlock; } perm = 0; if (is_write_mode(fmode)) perm |= PERM_WRITE; if (is_read_mode(fmode)) perm |= PERM_READ; perm &= mdev->flags; if (perm == 0) { err = -ENXIO; goto unlock; } /* already opened? */ if ((mdev->opened & perm) == perm) { err = 0; goto unlock; } perm &= ~mdev->opened; memset(&subs, 0, sizeof(subs)); if (perm & PERM_WRITE) { subs.sender = dp->addr; subs.dest.client = mdev->client; subs.dest.port = mdev->port; if (snd_seq_kernel_client_ctl(dp->cseq, SNDRV_SEQ_IOCTL_SUBSCRIBE_PORT, &subs) >= 0) mdev->opened |= PERM_WRITE; } if (perm & PERM_READ) { subs.sender.client = mdev->client; subs.sender.port = mdev->port; subs.dest = dp->addr; subs.flags = SNDRV_SEQ_PORT_SUBS_TIMESTAMP; subs.queue = dp->queue; /* queue for timestamps */ if (snd_seq_kernel_client_ctl(dp->cseq, SNDRV_SEQ_IOCTL_SUBSCRIBE_PORT, &subs) >= 0) mdev->opened |= PERM_READ; } if (! mdev->opened) { err = -ENXIO; goto unlock; } mdev->devinfo = dp; err = 0; unlock: mutex_unlock(&mdev->open_mutex); snd_use_lock_free(&mdev->use_lock); return err; } /* * close the midi device if already opened */ int snd_seq_oss_midi_close(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_midi *mdev; struct snd_seq_port_subscribe subs; mdev = get_mididev(dp, dev); if (!mdev) return -ENODEV; mutex_lock(&mdev->open_mutex); if (!mdev->opened || mdev->devinfo != dp) goto unlock; memset(&subs, 0, sizeof(subs)); if (mdev->opened & PERM_WRITE) { subs.sender = dp->addr; subs.dest.client = mdev->client; subs.dest.port = mdev->port; snd_seq_kernel_client_ctl(dp->cseq, SNDRV_SEQ_IOCTL_UNSUBSCRIBE_PORT, &subs); } if (mdev->opened & PERM_READ) { subs.sender.client = mdev->client; subs.sender.port = mdev->port; subs.dest = dp->addr; snd_seq_kernel_client_ctl(dp->cseq, SNDRV_SEQ_IOCTL_UNSUBSCRIBE_PORT, &subs); } mdev->opened = 0; mdev->devinfo = NULL; unlock: mutex_unlock(&mdev->open_mutex); snd_use_lock_free(&mdev->use_lock); return 0; } /* * change seq capability flags to file mode flags */ int snd_seq_oss_midi_filemode(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_midi *mdev; int mode; mdev = get_mididev(dp, dev); if (!mdev) return 0; mode = 0; if (mdev->opened & PERM_WRITE) mode |= SNDRV_SEQ_OSS_FILE_WRITE; if (mdev->opened & PERM_READ) mode |= SNDRV_SEQ_OSS_FILE_READ; snd_use_lock_free(&mdev->use_lock); return mode; } /* * reset the midi device and close it: * so far, only close the device. */ void snd_seq_oss_midi_reset(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_midi *mdev; mdev = get_mididev(dp, dev); if (!mdev) return; if (! mdev->opened) { snd_use_lock_free(&mdev->use_lock); return; } if (mdev->opened & PERM_WRITE) { struct snd_seq_event ev; int c; memset(&ev, 0, sizeof(ev)); ev.dest.client = mdev->client; ev.dest.port = mdev->port; ev.queue = dp->queue; ev.source.port = dp->port; if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_SYNTH) { ev.type = SNDRV_SEQ_EVENT_SENSING; snd_seq_oss_dispatch(dp, &ev, 0, 0); } for (c = 0; c < 16; c++) { ev.type = SNDRV_SEQ_EVENT_CONTROLLER; ev.data.control.channel = c; ev.data.control.param = MIDI_CTL_ALL_NOTES_OFF; snd_seq_oss_dispatch(dp, &ev, 0, 0); if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) { ev.data.control.param = MIDI_CTL_RESET_CONTROLLERS; snd_seq_oss_dispatch(dp, &ev, 0, 0); ev.type = SNDRV_SEQ_EVENT_PITCHBEND; ev.data.control.value = 0; snd_seq_oss_dispatch(dp, &ev, 0, 0); } } } // snd_seq_oss_midi_close(dp, dev); snd_use_lock_free(&mdev->use_lock); } /* * get client/port of the specified MIDI device */ void snd_seq_oss_midi_get_addr(struct seq_oss_devinfo *dp, int dev, struct snd_seq_addr *addr) { struct seq_oss_midi *mdev; mdev = get_mididev(dp, dev); if (!mdev) return; addr->client = mdev->client; addr->port = mdev->port; snd_use_lock_free(&mdev->use_lock); } /* * input callback - this can be atomic */ int snd_seq_oss_midi_input(struct snd_seq_event *ev, int direct, void *private_data) { struct seq_oss_devinfo *dp = (struct seq_oss_devinfo *)private_data; struct seq_oss_midi *mdev; int rc; if (dp->readq == NULL) return 0; mdev = find_slot(ev->source.client, ev->source.port); if (!mdev) return 0; if (! (mdev->opened & PERM_READ)) { snd_use_lock_free(&mdev->use_lock); return 0; } if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) rc = send_synth_event(dp, ev, mdev->seq_device); else rc = send_midi_event(dp, ev, mdev); snd_use_lock_free(&mdev->use_lock); return rc; } /* * convert ALSA sequencer event to OSS synth event */ static int send_synth_event(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, int dev) { union evrec ossev; memset(&ossev, 0, sizeof(ossev)); switch (ev->type) { case SNDRV_SEQ_EVENT_NOTEON: ossev.v.cmd = MIDI_NOTEON; break; case SNDRV_SEQ_EVENT_NOTEOFF: ossev.v.cmd = MIDI_NOTEOFF; break; case SNDRV_SEQ_EVENT_KEYPRESS: ossev.v.cmd = MIDI_KEY_PRESSURE; break; case SNDRV_SEQ_EVENT_CONTROLLER: ossev.l.cmd = MIDI_CTL_CHANGE; break; case SNDRV_SEQ_EVENT_PGMCHANGE: ossev.l.cmd = MIDI_PGM_CHANGE; break; case SNDRV_SEQ_EVENT_CHANPRESS: ossev.l.cmd = MIDI_CHN_PRESSURE; break; case SNDRV_SEQ_EVENT_PITCHBEND: ossev.l.cmd = MIDI_PITCH_BEND; break; default: return 0; /* not supported */ } ossev.v.dev = dev; switch (ev->type) { case SNDRV_SEQ_EVENT_NOTEON: case SNDRV_SEQ_EVENT_NOTEOFF: case SNDRV_SEQ_EVENT_KEYPRESS: ossev.v.code = EV_CHN_VOICE; ossev.v.note = ev->data.note.note; ossev.v.parm = ev->data.note.velocity; ossev.v.chn = ev->data.note.channel; break; case SNDRV_SEQ_EVENT_CONTROLLER: case SNDRV_SEQ_EVENT_PGMCHANGE: case SNDRV_SEQ_EVENT_CHANPRESS: ossev.l.code = EV_CHN_COMMON; ossev.l.p1 = ev->data.control.param; ossev.l.val = ev->data.control.value; ossev.l.chn = ev->data.control.channel; break; case SNDRV_SEQ_EVENT_PITCHBEND: ossev.l.code = EV_CHN_COMMON; ossev.l.val = ev->data.control.value + 8192; ossev.l.chn = ev->data.control.channel; break; } snd_seq_oss_readq_put_timestamp(dp->readq, ev->time.tick, dp->seq_mode); snd_seq_oss_readq_put_event(dp->readq, &ossev); return 0; } /* * decode event and send MIDI bytes to read queue */ static int send_midi_event(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, struct seq_oss_midi *mdev) { char msg[32]; int len; snd_seq_oss_readq_put_timestamp(dp->readq, ev->time.tick, dp->seq_mode); if (!dp->timer->running) len = snd_seq_oss_timer_start(dp->timer); if (ev->type == SNDRV_SEQ_EVENT_SYSEX) { snd_seq_oss_readq_sysex(dp->readq, mdev->seq_device, ev); snd_midi_event_reset_decode(mdev->coder); } else { len = snd_midi_event_decode(mdev->coder, msg, sizeof(msg), ev); if (len > 0) snd_seq_oss_readq_puts(dp->readq, mdev->seq_device, msg, len); } return 0; } /* * dump midi data * return 0 : enqueued * non-zero : invalid - ignored */ int snd_seq_oss_midi_putc(struct seq_oss_devinfo *dp, int dev, unsigned char c, struct snd_seq_event *ev) { struct seq_oss_midi *mdev; mdev = get_mididev(dp, dev); if (!mdev) return -ENODEV; if (snd_midi_event_encode_byte(mdev->coder, c, ev)) { snd_seq_oss_fill_addr(dp, ev, mdev->client, mdev->port); snd_use_lock_free(&mdev->use_lock); return 0; } snd_use_lock_free(&mdev->use_lock); return -EINVAL; } /* * create OSS compatible midi_info record */ int snd_seq_oss_midi_make_info(struct seq_oss_devinfo *dp, int dev, struct midi_info *inf) { struct seq_oss_midi *mdev; mdev = get_mididev(dp, dev); if (!mdev) return -ENXIO; inf->device = dev; inf->dev_type = 0; /* FIXME: ?? */ inf->capabilities = 0; /* FIXME: ?? */ strscpy(inf->name, mdev->name, sizeof(inf->name)); snd_use_lock_free(&mdev->use_lock); return 0; } #ifdef CONFIG_SND_PROC_FS /* * proc interface */ static char * capmode_str(int val) { val &= PERM_READ|PERM_WRITE; if (val == (PERM_READ|PERM_WRITE)) return "read/write"; else if (val == PERM_READ) return "read"; else if (val == PERM_WRITE) return "write"; else return "none"; } void snd_seq_oss_midi_info_read(struct snd_info_buffer *buf) { int i; struct seq_oss_midi *mdev; snd_iprintf(buf, "\nNumber of MIDI devices: %d\n", max_midi_devs); for (i = 0; i < max_midi_devs; i++) { snd_iprintf(buf, "\nmidi %d: ", i); mdev = get_mdev(i); if (mdev == NULL) { snd_iprintf(buf, "*empty*\n"); continue; } snd_iprintf(buf, "[%s] ALSA port %d:%d\n", mdev->name, mdev->client, mdev->port); snd_iprintf(buf, " capability %s / opened %s\n", capmode_str(mdev->flags), capmode_str(mdev->opened)); snd_use_lock_free(&mdev->use_lock); } } #endif /* CONFIG_SND_PROC_FS */ |
4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 | /* * Copyright (c) 2016 Intel Corporation * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. */ #include <drm/drm_atomic_helper.h> #include <drm/drm_fb_helper.h> #include <drm/drm_fourcc.h> #include <drm/drm_modeset_helper.h> #include <drm/drm_plane_helper.h> #include <drm/drm_print.h> #include <drm/drm_probe_helper.h> /** * DOC: aux kms helpers * * This helper library contains various one-off functions which don't really fit * anywhere else in the DRM modeset helper library. */ /** * drm_helper_move_panel_connectors_to_head() - move panels to the front in the * connector list * @dev: drm device to operate on * * Some userspace presumes that the first connected connector is the main * display, where it's supposed to display e.g. the login screen. For * laptops, this should be the main panel. Use this function to sort all * (eDP/LVDS/DSI) panels to the front of the connector list, instead of * painstakingly trying to initialize them in the right order. */ void drm_helper_move_panel_connectors_to_head(struct drm_device *dev) { struct drm_connector *connector, *tmp; struct list_head panel_list; INIT_LIST_HEAD(&panel_list); spin_lock_irq(&dev->mode_config.connector_list_lock); list_for_each_entry_safe(connector, tmp, &dev->mode_config.connector_list, head) { if (connector->connector_type == DRM_MODE_CONNECTOR_LVDS || connector->connector_type == DRM_MODE_CONNECTOR_eDP || connector->connector_type == DRM_MODE_CONNECTOR_DSI) list_move_tail(&connector->head, &panel_list); } list_splice(&panel_list, &dev->mode_config.connector_list); spin_unlock_irq(&dev->mode_config.connector_list_lock); } EXPORT_SYMBOL(drm_helper_move_panel_connectors_to_head); /** * drm_helper_mode_fill_fb_struct - fill out framebuffer metadata * @dev: DRM device * @fb: drm_framebuffer object to fill out * @mode_cmd: metadata from the userspace fb creation request * * This helper can be used in a drivers fb_create callback to pre-fill the fb's * metadata fields. */ void drm_helper_mode_fill_fb_struct(struct drm_device *dev, struct drm_framebuffer *fb, const struct drm_mode_fb_cmd2 *mode_cmd) { int i; fb->dev = dev; fb->format = drm_get_format_info(dev, mode_cmd); fb->width = mode_cmd->width; fb->height = mode_cmd->height; for (i = 0; i < 4; i++) { fb->pitches[i] = mode_cmd->pitches[i]; fb->offsets[i] = mode_cmd->offsets[i]; } fb->modifier = mode_cmd->modifier[0]; fb->flags = mode_cmd->flags; } EXPORT_SYMBOL(drm_helper_mode_fill_fb_struct); /* * This is the minimal list of formats that seem to be safe for modeset use * with all current DRM drivers. Most hardware can actually support more * formats than this and drivers may specify a more accurate list when * creating the primary plane. However drivers that still call * drm_plane_init() will use this minimal format list as the default. */ static const uint32_t safe_modeset_formats[] = { DRM_FORMAT_XRGB8888, DRM_FORMAT_ARGB8888, }; static struct drm_plane *create_primary_plane(struct drm_device *dev) { struct drm_plane *primary; int ret; primary = kzalloc(sizeof(*primary), GFP_KERNEL); if (primary == NULL) { DRM_DEBUG_KMS("Failed to allocate primary plane\n"); return NULL; } /* * Remove the format_default field from drm_plane when dropping * this helper. */ primary->format_default = true; /* possible_crtc's will be filled in later by crtc_init */ ret = drm_universal_plane_init(dev, primary, 0, &drm_primary_helper_funcs, safe_modeset_formats, ARRAY_SIZE(safe_modeset_formats), NULL, DRM_PLANE_TYPE_PRIMARY, NULL); if (ret) { kfree(primary); primary = NULL; } return primary; } /** * drm_crtc_init - Legacy CRTC initialization function * @dev: DRM device * @crtc: CRTC object to init * @funcs: callbacks for the new CRTC * * Initialize a CRTC object with a default helper-provided primary plane and no * cursor plane. * * Note that we make some assumptions about hardware limitations that may not be * true for all hardware: * * 1. Primary plane cannot be repositioned. * 2. Primary plane cannot be scaled. * 3. Primary plane must cover the entire CRTC. * 4. Subpixel positioning is not supported. * 5. The primary plane must always be on if the CRTC is enabled. * * This is purely a backwards compatibility helper for old drivers. Drivers * should instead implement their own primary plane. Atomic drivers must do so. * Drivers with the above hardware restriction can look into using &struct * drm_simple_display_pipe, which encapsulates the above limitations into a nice * interface. * * Returns: * Zero on success, error code on failure. */ int drm_crtc_init(struct drm_device *dev, struct drm_crtc *crtc, const struct drm_crtc_funcs *funcs) { struct drm_plane *primary; primary = create_primary_plane(dev); return drm_crtc_init_with_planes(dev, crtc, primary, NULL, funcs, NULL); } EXPORT_SYMBOL(drm_crtc_init); /** * drm_mode_config_helper_suspend - Modeset suspend helper * @dev: DRM device * * This helper function takes care of suspending the modeset side. It disables * output polling if initialized, suspends fbdev if used and finally calls * drm_atomic_helper_suspend(). * If suspending fails, fbdev and polling is re-enabled. * * Returns: * Zero on success, negative error code on error. * * See also: * drm_kms_helper_poll_disable() and drm_fb_helper_set_suspend_unlocked(). */ int drm_mode_config_helper_suspend(struct drm_device *dev) { struct drm_atomic_state *state; if (!dev) return 0; /* * Don't disable polling if it was never initialized */ if (dev->mode_config.poll_enabled) drm_kms_helper_poll_disable(dev); drm_fb_helper_set_suspend_unlocked(dev->fb_helper, 1); state = drm_atomic_helper_suspend(dev); if (IS_ERR(state)) { drm_fb_helper_set_suspend_unlocked(dev->fb_helper, 0); /* * Don't enable polling if it was never initialized */ if (dev->mode_config.poll_enabled) drm_kms_helper_poll_enable(dev); return PTR_ERR(state); } dev->mode_config.suspend_state = state; return 0; } EXPORT_SYMBOL(drm_mode_config_helper_suspend); /** * drm_mode_config_helper_resume - Modeset resume helper * @dev: DRM device * * This helper function takes care of resuming the modeset side. It calls * drm_atomic_helper_resume(), resumes fbdev if used and enables output polling * if initiaized. * * Returns: * Zero on success, negative error code on error. * * See also: * drm_fb_helper_set_suspend_unlocked() and drm_kms_helper_poll_enable(). */ int drm_mode_config_helper_resume(struct drm_device *dev) { int ret; if (!dev) return 0; if (WARN_ON(!dev->mode_config.suspend_state)) return -EINVAL; ret = drm_atomic_helper_resume(dev, dev->mode_config.suspend_state); if (ret) DRM_ERROR("Failed to resume (%d)\n", ret); dev->mode_config.suspend_state = NULL; drm_fb_helper_set_suspend_unlocked(dev->fb_helper, 0); /* * Don't enable polling if it is not initialized */ if (dev->mode_config.poll_enabled) drm_kms_helper_poll_enable(dev); return ret; } EXPORT_SYMBOL(drm_mode_config_helper_resume); |
6 6 4 53 4 51 1 86 85 1 1 85 85 266 267 267 267 267 267 267 265 266 266 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Advanced Linux Sound Architecture * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/init.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/device.h> #include <linux/module.h> #include <linux/debugfs.h> #include <sound/core.h> #include <sound/minors.h> #include <sound/info.h> #include <sound/control.h> #include <sound/initval.h> #include <linux/kmod.h> #include <linux/mutex.h> static int major = CONFIG_SND_MAJOR; int snd_major; EXPORT_SYMBOL(snd_major); static int cards_limit = 1; MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>"); MODULE_DESCRIPTION("Advanced Linux Sound Architecture driver for soundcards."); MODULE_LICENSE("GPL"); module_param(major, int, 0444); MODULE_PARM_DESC(major, "Major # for sound driver."); module_param(cards_limit, int, 0444); MODULE_PARM_DESC(cards_limit, "Count of auto-loadable soundcards."); MODULE_ALIAS_CHARDEV_MAJOR(CONFIG_SND_MAJOR); /* this one holds the actual max. card number currently available. * as default, it's identical with cards_limit option. when more * modules are loaded manually, this limit number increases, too. */ int snd_ecards_limit; EXPORT_SYMBOL(snd_ecards_limit); #ifdef CONFIG_SND_DEBUG struct dentry *sound_debugfs_root; EXPORT_SYMBOL_GPL(sound_debugfs_root); #endif static struct snd_minor *snd_minors[SNDRV_OS_MINORS]; static DEFINE_MUTEX(sound_mutex); #ifdef CONFIG_MODULES /** * snd_request_card - try to load the card module * @card: the card number * * Tries to load the module "snd-card-X" for the given card number * via request_module. Returns immediately if already loaded. */ void snd_request_card(int card) { if (snd_card_locked(card)) return; if (card < 0 || card >= cards_limit) return; request_module("snd-card-%i", card); } EXPORT_SYMBOL(snd_request_card); static void snd_request_other(int minor) { char *str; switch (minor) { case SNDRV_MINOR_SEQUENCER: str = "snd-seq"; break; case SNDRV_MINOR_TIMER: str = "snd-timer"; break; default: return; } request_module(str); } #endif /* modular kernel */ /** * snd_lookup_minor_data - get user data of a registered device * @minor: the minor number * @type: device type (SNDRV_DEVICE_TYPE_XXX) * * Checks that a minor device with the specified type is registered, and returns * its user data pointer. * * This function increments the reference counter of the card instance * if an associated instance with the given minor number and type is found. * The caller must call snd_card_unref() appropriately later. * * Return: The user data pointer if the specified device is found. %NULL * otherwise. */ void *snd_lookup_minor_data(unsigned int minor, int type) { struct snd_minor *mreg; void *private_data; if (minor >= ARRAY_SIZE(snd_minors)) return NULL; mutex_lock(&sound_mutex); mreg = snd_minors[minor]; if (mreg && mreg->type == type) { private_data = mreg->private_data; if (private_data && mreg->card_ptr) get_device(&mreg->card_ptr->card_dev); } else private_data = NULL; mutex_unlock(&sound_mutex); return private_data; } EXPORT_SYMBOL(snd_lookup_minor_data); #ifdef CONFIG_MODULES static struct snd_minor *autoload_device(unsigned int minor) { int dev; mutex_unlock(&sound_mutex); /* release lock temporarily */ dev = SNDRV_MINOR_DEVICE(minor); if (dev == SNDRV_MINOR_CONTROL) { /* /dev/aloadC? */ int card = SNDRV_MINOR_CARD(minor); struct snd_card *ref = snd_card_ref(card); if (!ref) snd_request_card(card); else snd_card_unref(ref); } else if (dev == SNDRV_MINOR_GLOBAL) { /* /dev/aloadSEQ */ snd_request_other(minor); } mutex_lock(&sound_mutex); /* reacuire lock */ return snd_minors[minor]; } #else /* !CONFIG_MODULES */ #define autoload_device(minor) NULL #endif /* CONFIG_MODULES */ static int snd_open(struct inode *inode, struct file *file) { unsigned int minor = iminor(inode); struct snd_minor *mptr = NULL; const struct file_operations *new_fops; int err = 0; if (minor >= ARRAY_SIZE(snd_minors)) return -ENODEV; mutex_lock(&sound_mutex); mptr = snd_minors[minor]; if (mptr == NULL) { mptr = autoload_device(minor); if (!mptr) { mutex_unlock(&sound_mutex); return -ENODEV; } } new_fops = fops_get(mptr->f_ops); mutex_unlock(&sound_mutex); if (!new_fops) return -ENODEV; replace_fops(file, new_fops); if (file->f_op->open) err = file->f_op->open(inode, file); return err; } static const struct file_operations snd_fops = { .owner = THIS_MODULE, .open = snd_open, .llseek = noop_llseek, }; #ifdef CONFIG_SND_DYNAMIC_MINORS static int snd_find_free_minor(int type, struct snd_card *card, int dev) { int minor; /* static minors for module auto loading */ if (type == SNDRV_DEVICE_TYPE_SEQUENCER) return SNDRV_MINOR_SEQUENCER; if (type == SNDRV_DEVICE_TYPE_TIMER) return SNDRV_MINOR_TIMER; for (minor = 0; minor < ARRAY_SIZE(snd_minors); ++minor) { /* skip static minors still used for module auto loading */ if (SNDRV_MINOR_DEVICE(minor) == SNDRV_MINOR_CONTROL) continue; if (minor == SNDRV_MINOR_SEQUENCER || minor == SNDRV_MINOR_TIMER) continue; if (!snd_minors[minor]) return minor; } return -EBUSY; } #else static int snd_find_free_minor(int type, struct snd_card *card, int dev) { int minor; switch (type) { case SNDRV_DEVICE_TYPE_SEQUENCER: case SNDRV_DEVICE_TYPE_TIMER: minor = type; break; case SNDRV_DEVICE_TYPE_CONTROL: if (snd_BUG_ON(!card)) return -EINVAL; minor = SNDRV_MINOR(card->number, type); break; case SNDRV_DEVICE_TYPE_HWDEP: case SNDRV_DEVICE_TYPE_RAWMIDI: case SNDRV_DEVICE_TYPE_PCM_PLAYBACK: case SNDRV_DEVICE_TYPE_PCM_CAPTURE: case SNDRV_DEVICE_TYPE_COMPRESS: if (snd_BUG_ON(!card)) return -EINVAL; minor = SNDRV_MINOR(card->number, type + dev); break; default: return -EINVAL; } if (snd_BUG_ON(minor < 0 || minor >= SNDRV_OS_MINORS)) return -EINVAL; if (snd_minors[minor]) return -EBUSY; return minor; } #endif /** * snd_register_device - Register the ALSA device file for the card * @type: the device type, SNDRV_DEVICE_TYPE_XXX * @card: the card instance * @dev: the device index * @f_ops: the file operations * @private_data: user pointer for f_ops->open() * @device: the device to register * * Registers an ALSA device file for the given card. * The operators have to be set in reg parameter. * * Return: Zero if successful, or a negative error code on failure. */ int snd_register_device(int type, struct snd_card *card, int dev, const struct file_operations *f_ops, void *private_data, struct device *device) { int minor; int err = 0; struct snd_minor *preg; if (snd_BUG_ON(!device)) return -EINVAL; preg = kmalloc(sizeof *preg, GFP_KERNEL); if (preg == NULL) return -ENOMEM; preg->type = type; preg->card = card ? card->number : -1; preg->device = dev; preg->f_ops = f_ops; preg->private_data = private_data; preg->card_ptr = card; mutex_lock(&sound_mutex); minor = snd_find_free_minor(type, card, dev); if (minor < 0) { err = minor; goto error; } preg->dev = device; device->devt = MKDEV(major, minor); err = device_add(device); if (err < 0) goto error; snd_minors[minor] = preg; error: mutex_unlock(&sound_mutex); if (err < 0) kfree(preg); return err; } EXPORT_SYMBOL(snd_register_device); /** * snd_unregister_device - unregister the device on the given card * @dev: the device instance * * Unregisters the device file already registered via * snd_register_device(). * * Return: Zero if successful, or a negative error code on failure. */ int snd_unregister_device(struct device *dev) { int minor; struct snd_minor *preg; mutex_lock(&sound_mutex); for (minor = 0; minor < ARRAY_SIZE(snd_minors); ++minor) { preg = snd_minors[minor]; if (preg && preg->dev == dev) { snd_minors[minor] = NULL; device_del(dev); kfree(preg); break; } } mutex_unlock(&sound_mutex); if (minor >= ARRAY_SIZE(snd_minors)) return -ENOENT; return 0; } EXPORT_SYMBOL(snd_unregister_device); #ifdef CONFIG_SND_PROC_FS /* * INFO PART */ static const char *snd_device_type_name(int type) { switch (type) { case SNDRV_DEVICE_TYPE_CONTROL: return "control"; case SNDRV_DEVICE_TYPE_HWDEP: return "hardware dependent"; case SNDRV_DEVICE_TYPE_RAWMIDI: return "raw midi"; case SNDRV_DEVICE_TYPE_PCM_PLAYBACK: return "digital audio playback"; case SNDRV_DEVICE_TYPE_PCM_CAPTURE: return "digital audio capture"; case SNDRV_DEVICE_TYPE_SEQUENCER: return "sequencer"; case SNDRV_DEVICE_TYPE_TIMER: return "timer"; case SNDRV_DEVICE_TYPE_COMPRESS: return "compress"; default: return "?"; } } static void snd_minor_info_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { int minor; struct snd_minor *mptr; mutex_lock(&sound_mutex); for (minor = 0; minor < SNDRV_OS_MINORS; ++minor) { mptr = snd_minors[minor]; if (!mptr) continue; if (mptr->card >= 0) { if (mptr->device >= 0) snd_iprintf(buffer, "%3i: [%2i-%2i]: %s\n", minor, mptr->card, mptr->device, snd_device_type_name(mptr->type)); else snd_iprintf(buffer, "%3i: [%2i] : %s\n", minor, mptr->card, snd_device_type_name(mptr->type)); } else snd_iprintf(buffer, "%3i: : %s\n", minor, snd_device_type_name(mptr->type)); } mutex_unlock(&sound_mutex); } int __init snd_minor_info_init(void) { struct snd_info_entry *entry; entry = snd_info_create_module_entry(THIS_MODULE, "devices", NULL); if (!entry) return -ENOMEM; entry->c.text.read = snd_minor_info_read; return snd_info_register(entry); /* freed in error path */ } #endif /* CONFIG_SND_PROC_FS */ /* * INIT PART */ static int __init alsa_sound_init(void) { snd_major = major; snd_ecards_limit = cards_limit; if (register_chrdev(major, "alsa", &snd_fops)) { pr_err("ALSA core: unable to register native major device number %d\n", major); return -EIO; } if (snd_info_init() < 0) { unregister_chrdev(major, "alsa"); return -ENOMEM; } #ifdef CONFIG_SND_DEBUG sound_debugfs_root = debugfs_create_dir("sound", NULL); #endif #ifndef MODULE pr_info("Advanced Linux Sound Architecture Driver Initialized.\n"); #endif return 0; } static void __exit alsa_sound_exit(void) { #ifdef CONFIG_SND_DEBUG debugfs_remove(sound_debugfs_root); #endif snd_info_done(); unregister_chrdev(major, "alsa"); } subsys_initcall(alsa_sound_init); module_exit(alsa_sound_exit); |
2 1 1 1 1 1 1 2 2 1 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 | // SPDX-License-Identifier: GPL-2.0-or-later /* * SQ905 subdriver * * Copyright (C) 2008, 2009 Adam Baker and Theodore Kilgore */ /* * History and Acknowledgments * * The original Linux driver for SQ905 based cameras was written by * Marcell Lengyel and further developed by many other contributors * and is available from http://sourceforge.net/projects/sqcam/ * * This driver takes advantage of the reverse engineering work done for * that driver and for libgphoto2 but shares no code with them. * * This driver has used as a base the finepix driver and other gspca * based drivers and may still contain code fragments taken from those * drivers. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define MODULE_NAME "sq905" #include <linux/workqueue.h> #include <linux/slab.h> #include "gspca.h" MODULE_AUTHOR("Adam Baker <linux@baker-net.org.uk>, Theodore Kilgore <kilgota@auburn.edu>"); MODULE_DESCRIPTION("GSPCA/SQ905 USB Camera Driver"); MODULE_LICENSE("GPL"); /* Default timeouts, in ms */ #define SQ905_CMD_TIMEOUT 500 #define SQ905_DATA_TIMEOUT 1000 /* Maximum transfer size to use. */ #define SQ905_MAX_TRANSFER 0x8000 #define FRAME_HEADER_LEN 64 /* The known modes, or registers. These go in the "value" slot. */ /* 00 is "none" obviously */ #define SQ905_BULK_READ 0x03 /* precedes any bulk read */ #define SQ905_COMMAND 0x06 /* precedes the command codes below */ #define SQ905_PING 0x07 /* when reading an "idling" command */ #define SQ905_READ_DONE 0xc0 /* ack bulk read completed */ /* Any non-zero value in the bottom 2 bits of the 2nd byte of * the ID appears to indicate the camera can do 640*480. If the * LSB of that byte is set the image is just upside down, otherwise * it is rotated 180 degrees. */ #define SQ905_HIRES_MASK 0x00000300 #define SQ905_ORIENTATION_MASK 0x00000100 /* Some command codes. These go in the "index" slot. */ #define SQ905_ID 0xf0 /* asks for model string */ #define SQ905_CONFIG 0x20 /* gets photo alloc. table, not used here */ #define SQ905_DATA 0x30 /* accesses photo data, not used here */ #define SQ905_CLEAR 0xa0 /* clear everything */ #define SQ905_CAPTURE_LOW 0x60 /* Starts capture at 160x120 */ #define SQ905_CAPTURE_MED 0x61 /* Starts capture at 320x240 */ #define SQ905_CAPTURE_HIGH 0x62 /* Starts capture at 640x480 (some cams only) */ /* note that the capture command also controls the output dimensions */ /* Structure to hold all of our device specific stuff */ struct sd { struct gspca_dev gspca_dev; /* !! must be the first item */ /* * Driver stuff */ struct work_struct work_struct; struct workqueue_struct *work_thread; }; static struct v4l2_pix_format sq905_mode[] = { { 160, 120, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, .bytesperline = 160, .sizeimage = 160 * 120, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 0}, { 320, 240, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, .bytesperline = 320, .sizeimage = 320 * 240, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 0}, { 640, 480, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, .bytesperline = 640, .sizeimage = 640 * 480, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 0} }; /* * Send a command to the camera. */ static int sq905_command(struct gspca_dev *gspca_dev, u16 index) { int ret; gspca_dev->usb_buf[0] = '\0'; ret = usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), USB_REQ_SYNCH_FRAME, /* request */ USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, SQ905_COMMAND, index, gspca_dev->usb_buf, 1, SQ905_CMD_TIMEOUT); if (ret < 0) { pr_err("%s: usb_control_msg failed (%d)\n", __func__, ret); return ret; } ret = usb_control_msg(gspca_dev->dev, usb_rcvctrlpipe(gspca_dev->dev, 0), USB_REQ_SYNCH_FRAME, /* request */ USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, SQ905_PING, 0, gspca_dev->usb_buf, 1, SQ905_CMD_TIMEOUT); if (ret < 0) { pr_err("%s: usb_control_msg failed 2 (%d)\n", __func__, ret); return ret; } return 0; } /* * Acknowledge the end of a frame - see warning on sq905_command. */ static int sq905_ack_frame(struct gspca_dev *gspca_dev) { int ret; gspca_dev->usb_buf[0] = '\0'; ret = usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), USB_REQ_SYNCH_FRAME, /* request */ USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, SQ905_READ_DONE, 0, gspca_dev->usb_buf, 1, SQ905_CMD_TIMEOUT); if (ret < 0) { pr_err("%s: usb_control_msg failed (%d)\n", __func__, ret); return ret; } return 0; } /* * request and read a block of data - see warning on sq905_command. */ static int sq905_read_data(struct gspca_dev *gspca_dev, u8 *data, int size, int need_lock) { int ret; int act_len = 0; gspca_dev->usb_buf[0] = '\0'; if (need_lock) mutex_lock(&gspca_dev->usb_lock); ret = usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), USB_REQ_SYNCH_FRAME, /* request */ USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, SQ905_BULK_READ, size, gspca_dev->usb_buf, 1, SQ905_CMD_TIMEOUT); if (need_lock) mutex_unlock(&gspca_dev->usb_lock); if (ret < 0) { pr_err("%s: usb_control_msg failed (%d)\n", __func__, ret); return ret; } ret = usb_bulk_msg(gspca_dev->dev, usb_rcvbulkpipe(gspca_dev->dev, 0x81), data, size, &act_len, SQ905_DATA_TIMEOUT); /* successful, it returns 0, otherwise negative */ if (ret < 0 || act_len != size) { pr_err("bulk read fail (%d) len %d/%d\n", ret, act_len, size); return -EIO; } return 0; } /* * This function is called as a workqueue function and runs whenever the camera * is streaming data. Because it is a workqueue function it is allowed to sleep * so we can use synchronous USB calls. To avoid possible collisions with other * threads attempting to use gspca_dev->usb_buf we take the usb_lock when * performing USB operations using it. In practice we don't really need this * as the camera doesn't provide any controls. */ static void sq905_dostream(struct work_struct *work) { struct sd *dev = container_of(work, struct sd, work_struct); struct gspca_dev *gspca_dev = &dev->gspca_dev; int bytes_left; /* bytes remaining in current frame. */ int data_len; /* size to use for the next read. */ int header_read; /* true if we have already read the frame header. */ int packet_type; int frame_sz; int ret; u8 *data; u8 *buffer; buffer = kmalloc(SQ905_MAX_TRANSFER, GFP_KERNEL); if (!buffer) { pr_err("Couldn't allocate USB buffer\n"); goto quit_stream; } frame_sz = gspca_dev->cam.cam_mode[gspca_dev->curr_mode].sizeimage + FRAME_HEADER_LEN; while (gspca_dev->present && gspca_dev->streaming) { #ifdef CONFIG_PM if (gspca_dev->frozen) break; #endif /* request some data and then read it until we have * a complete frame. */ bytes_left = frame_sz; header_read = 0; /* Note we do not check for gspca_dev->streaming here, as we must finish reading an entire frame, otherwise the next time we stream we start reading in the middle of a frame. */ while (bytes_left > 0 && gspca_dev->present) { data_len = bytes_left > SQ905_MAX_TRANSFER ? SQ905_MAX_TRANSFER : bytes_left; ret = sq905_read_data(gspca_dev, buffer, data_len, 1); if (ret < 0) goto quit_stream; gspca_dbg(gspca_dev, D_PACK, "Got %d bytes out of %d for frame\n", data_len, bytes_left); bytes_left -= data_len; data = buffer; if (!header_read) { packet_type = FIRST_PACKET; /* The first 64 bytes of each frame are * a header full of FF 00 bytes */ data += FRAME_HEADER_LEN; data_len -= FRAME_HEADER_LEN; header_read = 1; } else if (bytes_left == 0) { packet_type = LAST_PACKET; } else { packet_type = INTER_PACKET; } gspca_frame_add(gspca_dev, packet_type, data, data_len); /* If entire frame fits in one packet we still need to add a LAST_PACKET */ if (packet_type == FIRST_PACKET && bytes_left == 0) gspca_frame_add(gspca_dev, LAST_PACKET, NULL, 0); } if (gspca_dev->present) { /* acknowledge the frame */ mutex_lock(&gspca_dev->usb_lock); ret = sq905_ack_frame(gspca_dev); mutex_unlock(&gspca_dev->usb_lock); if (ret < 0) goto quit_stream; } } quit_stream: if (gspca_dev->present) { mutex_lock(&gspca_dev->usb_lock); sq905_command(gspca_dev, SQ905_CLEAR); mutex_unlock(&gspca_dev->usb_lock); } kfree(buffer); } /* This function is called at probe time just before sd_init */ static int sd_config(struct gspca_dev *gspca_dev, const struct usb_device_id *id) { struct cam *cam = &gspca_dev->cam; struct sd *dev = (struct sd *) gspca_dev; /* We don't use the buffer gspca allocates so make it small. */ cam->bulk = 1; cam->bulk_size = 64; INIT_WORK(&dev->work_struct, sq905_dostream); return 0; } /* called on streamoff with alt==0 and on disconnect */ /* the usb_lock is held at entry - restore on exit */ static void sd_stop0(struct gspca_dev *gspca_dev) { struct sd *dev = (struct sd *) gspca_dev; /* wait for the work queue to terminate */ mutex_unlock(&gspca_dev->usb_lock); /* This waits for sq905_dostream to finish */ destroy_workqueue(dev->work_thread); dev->work_thread = NULL; mutex_lock(&gspca_dev->usb_lock); } /* this function is called at probe and resume time */ static int sd_init(struct gspca_dev *gspca_dev) { u32 ident; int ret; /* connect to the camera and read * the model ID and process that and put it away. */ ret = sq905_command(gspca_dev, SQ905_CLEAR); if (ret < 0) return ret; ret = sq905_command(gspca_dev, SQ905_ID); if (ret < 0) return ret; ret = sq905_read_data(gspca_dev, gspca_dev->usb_buf, 4, 0); if (ret < 0) return ret; /* usb_buf is allocated with kmalloc so is aligned. * Camera model number is the right way round if we assume this * reverse engineered ID is supposed to be big endian. */ ident = be32_to_cpup((__be32 *)gspca_dev->usb_buf); ret = sq905_command(gspca_dev, SQ905_CLEAR); if (ret < 0) return ret; gspca_dbg(gspca_dev, D_CONF, "SQ905 camera ID %08x detected\n", ident); gspca_dev->cam.cam_mode = sq905_mode; gspca_dev->cam.nmodes = ARRAY_SIZE(sq905_mode); if (!(ident & SQ905_HIRES_MASK)) gspca_dev->cam.nmodes--; if (ident & SQ905_ORIENTATION_MASK) gspca_dev->cam.input_flags = V4L2_IN_ST_VFLIP; else gspca_dev->cam.input_flags = V4L2_IN_ST_VFLIP | V4L2_IN_ST_HFLIP; return 0; } /* Set up for getting frames. */ static int sd_start(struct gspca_dev *gspca_dev) { struct sd *dev = (struct sd *) gspca_dev; int ret; /* "Open the shutter" and set size, to start capture */ switch (gspca_dev->curr_mode) { default: /* case 2: */ gspca_dbg(gspca_dev, D_STREAM, "Start streaming at high resolution\n"); ret = sq905_command(&dev->gspca_dev, SQ905_CAPTURE_HIGH); break; case 1: gspca_dbg(gspca_dev, D_STREAM, "Start streaming at medium resolution\n"); ret = sq905_command(&dev->gspca_dev, SQ905_CAPTURE_MED); break; case 0: gspca_dbg(gspca_dev, D_STREAM, "Start streaming at low resolution\n"); ret = sq905_command(&dev->gspca_dev, SQ905_CAPTURE_LOW); } if (ret < 0) { gspca_err(gspca_dev, "Start streaming command failed\n"); return ret; } /* Start the workqueue function to do the streaming */ dev->work_thread = create_singlethread_workqueue(MODULE_NAME); if (!dev->work_thread) return -ENOMEM; queue_work(dev->work_thread, &dev->work_struct); return 0; } /* Table of supported USB devices */ static const struct usb_device_id device_table[] = { {USB_DEVICE(0x2770, 0x9120)}, {} }; MODULE_DEVICE_TABLE(usb, device_table); /* sub-driver description */ static const struct sd_desc sd_desc = { .name = MODULE_NAME, .config = sd_config, .init = sd_init, .start = sd_start, .stop0 = sd_stop0, }; /* -- device connect -- */ static int sd_probe(struct usb_interface *intf, const struct usb_device_id *id) { return gspca_dev_probe(intf, id, &sd_desc, sizeof(struct sd), THIS_MODULE); } static struct usb_driver sd_driver = { .name = MODULE_NAME, .id_table = device_table, .probe = sd_probe, .disconnect = gspca_disconnect, #ifdef CONFIG_PM .suspend = gspca_suspend, .resume = gspca_resume, .reset_resume = gspca_resume, #endif }; module_usb_driver(sd_driver); |
14 9 9 14 14 9 15 15 9 14 14 14 15 15 15 3 3 3 3 3 89 89 12 12 10 10 10 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 | // SPDX-License-Identifier: GPL-2.0-only /* * count the number of connections matching an arbitrary key. * * (C) 2017 Red Hat GmbH * Author: Florian Westphal <fw@strlen.de> * * split from xt_connlimit.c: * (c) 2000 Gerd Knorr <kraxel@bytesex.org> * Nov 2002: Martin Bene <martin.bene@icomedias.com>: * only ignore TIME_WAIT or gone connections * (C) CC Computer Consultants GmbH, 2007 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/in.h> #include <linux/in6.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/jhash.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/rbtree.h> #include <linux/module.h> #include <linux/random.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/netfilter/nf_conntrack_tcp.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_count.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_zones.h> #define CONNCOUNT_SLOTS 256U #define CONNCOUNT_GC_MAX_NODES 8 #define MAX_KEYLEN 5 /* we will save the tuples of all connections we care about */ struct nf_conncount_tuple { struct list_head node; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; int cpu; u32 jiffies32; }; struct nf_conncount_rb { struct rb_node node; struct nf_conncount_list list; u32 key[MAX_KEYLEN]; struct rcu_head rcu_head; }; static spinlock_t nf_conncount_locks[CONNCOUNT_SLOTS] __cacheline_aligned_in_smp; struct nf_conncount_data { unsigned int keylen; struct rb_root root[CONNCOUNT_SLOTS]; struct net *net; struct work_struct gc_work; unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)]; unsigned int gc_tree; }; static u_int32_t conncount_rnd __read_mostly; static struct kmem_cache *conncount_rb_cachep __read_mostly; static struct kmem_cache *conncount_conn_cachep __read_mostly; static inline bool already_closed(const struct nf_conn *conn) { if (nf_ct_protonum(conn) == IPPROTO_TCP) return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT || conn->proto.tcp.state == TCP_CONNTRACK_CLOSE; else return false; } static int key_diff(const u32 *a, const u32 *b, unsigned int klen) { return memcmp(a, b, klen * sizeof(u32)); } static void conn_free(struct nf_conncount_list *list, struct nf_conncount_tuple *conn) { lockdep_assert_held(&list->list_lock); list->count--; list_del(&conn->node); kmem_cache_free(conncount_conn_cachep, conn); } static const struct nf_conntrack_tuple_hash * find_or_evict(struct net *net, struct nf_conncount_list *list, struct nf_conncount_tuple *conn) { const struct nf_conntrack_tuple_hash *found; unsigned long a, b; int cpu = raw_smp_processor_id(); u32 age; found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple); if (found) return found; b = conn->jiffies32; a = (u32)jiffies; /* conn might have been added just before by another cpu and * might still be unconfirmed. In this case, nf_conntrack_find() * returns no result. Thus only evict if this cpu added the * stale entry or if the entry is older than two jiffies. */ age = a - b; if (conn->cpu == cpu || age >= 2) { conn_free(list, conn); return ERR_PTR(-ENOENT); } return ERR_PTR(-EAGAIN); } static int __nf_conncount_add(struct net *net, struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collect = 0; /* check the saved connections */ list_for_each_entry_safe(conn, conn_n, &list->head, node) { if (collect > CONNCOUNT_GC_MAX_NODES) break; found = find_or_evict(net, list, conn); if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { if (nf_ct_tuple_equal(&conn->tuple, tuple) && nf_ct_zone_id(&conn->zone, conn->zone.dir) == nf_ct_zone_id(zone, zone->dir)) return 0; /* already exists */ } else { collect++; } continue; } found_ct = nf_ct_tuplehash_to_ctrack(found); if (nf_ct_tuple_equal(&conn->tuple, tuple) && nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* * We should not see tuples twice unless someone hooks * this into a table without "-p tcp --syn". * * Attempt to avoid a re-add in this case. */ nf_ct_put(found_ct); return 0; } else if (already_closed(found_ct)) { /* * we do not care about connections which are * closed already -> ditch it */ nf_ct_put(found_ct); conn_free(list, conn); collect++; continue; } nf_ct_put(found_ct); } if (WARN_ON_ONCE(list->count > INT_MAX)) return -EOVERFLOW; conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); if (conn == NULL) return -ENOMEM; conn->tuple = *tuple; conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; list_add_tail(&conn->node, &list->head); list->count++; return 0; } int nf_conncount_add(struct net *net, struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { int ret; /* check the saved connections */ spin_lock_bh(&list->list_lock); ret = __nf_conncount_add(net, list, tuple, zone); spin_unlock_bh(&list->list_lock); return ret; } EXPORT_SYMBOL_GPL(nf_conncount_add); void nf_conncount_list_init(struct nf_conncount_list *list) { spin_lock_init(&list->list_lock); INIT_LIST_HEAD(&list->head); list->count = 0; } EXPORT_SYMBOL_GPL(nf_conncount_list_init); /* Return true if the list is empty. Must be called with BH disabled. */ bool nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collected = 0; bool ret = false; /* don't bother if other cpu is already doing GC */ if (!spin_trylock(&list->list_lock)) return false; list_for_each_entry_safe(conn, conn_n, &list->head, node) { found = find_or_evict(net, list, conn); if (IS_ERR(found)) { if (PTR_ERR(found) == -ENOENT) collected++; continue; } found_ct = nf_ct_tuplehash_to_ctrack(found); if (already_closed(found_ct)) { /* * we do not care about connections which are * closed already -> ditch it */ nf_ct_put(found_ct); conn_free(list, conn); collected++; continue; } nf_ct_put(found_ct); if (collected > CONNCOUNT_GC_MAX_NODES) break; } if (!list->count) ret = true; spin_unlock(&list->list_lock); return ret; } EXPORT_SYMBOL_GPL(nf_conncount_gc_list); static void __tree_nodes_free(struct rcu_head *h) { struct nf_conncount_rb *rbconn; rbconn = container_of(h, struct nf_conncount_rb, rcu_head); kmem_cache_free(conncount_rb_cachep, rbconn); } /* caller must hold tree nf_conncount_locks[] lock */ static void tree_nodes_free(struct rb_root *root, struct nf_conncount_rb *gc_nodes[], unsigned int gc_count) { struct nf_conncount_rb *rbconn; while (gc_count) { rbconn = gc_nodes[--gc_count]; spin_lock(&rbconn->list.list_lock); if (!rbconn->list.count) { rb_erase(&rbconn->node, root); call_rcu(&rbconn->rcu_head, __tree_nodes_free); } spin_unlock(&rbconn->list.list_lock); } } static void schedule_gc_worker(struct nf_conncount_data *data, int tree) { set_bit(tree, data->pending_trees); schedule_work(&data->gc_work); } static unsigned int insert_tree(struct net *net, struct nf_conncount_data *data, struct rb_root *root, unsigned int hash, const u32 *key, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; struct rb_node **rbnode, *parent; struct nf_conncount_rb *rbconn; struct nf_conncount_tuple *conn; unsigned int count = 0, gc_count = 0; bool do_gc = true; spin_lock_bh(&nf_conncount_locks[hash]); restart: parent = NULL; rbnode = &(root->rb_node); while (*rbnode) { int diff; rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node); parent = *rbnode; diff = key_diff(key, rbconn->key, data->keylen); if (diff < 0) { rbnode = &((*rbnode)->rb_left); } else if (diff > 0) { rbnode = &((*rbnode)->rb_right); } else { int ret; ret = nf_conncount_add(net, &rbconn->list, tuple, zone); if (ret) count = 0; /* hotdrop */ else count = rbconn->list.count; tree_nodes_free(root, gc_nodes, gc_count); goto out_unlock; } if (gc_count >= ARRAY_SIZE(gc_nodes)) continue; if (do_gc && nf_conncount_gc_list(net, &rbconn->list)) gc_nodes[gc_count++] = rbconn; } if (gc_count) { tree_nodes_free(root, gc_nodes, gc_count); schedule_gc_worker(data, hash); gc_count = 0; do_gc = false; goto restart; } /* expected case: match, insert new node */ rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); if (rbconn == NULL) goto out_unlock; conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); if (conn == NULL) { kmem_cache_free(conncount_rb_cachep, rbconn); goto out_unlock; } conn->tuple = *tuple; conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; memcpy(rbconn->key, key, sizeof(u32) * data->keylen); nf_conncount_list_init(&rbconn->list); list_add(&conn->node, &rbconn->list.head); count = 1; rbconn->list.count = count; rb_link_node_rcu(&rbconn->node, parent, rbnode); rb_insert_color(&rbconn->node, root); out_unlock: spin_unlock_bh(&nf_conncount_locks[hash]); return count; } static unsigned int count_tree(struct net *net, struct nf_conncount_data *data, const u32 *key, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { struct rb_root *root; struct rb_node *parent; struct nf_conncount_rb *rbconn; unsigned int hash; hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS; root = &data->root[hash]; parent = rcu_dereference_raw(root->rb_node); while (parent) { int diff; rbconn = rb_entry(parent, struct nf_conncount_rb, node); diff = key_diff(key, rbconn->key, data->keylen); if (diff < 0) { parent = rcu_dereference_raw(parent->rb_left); } else if (diff > 0) { parent = rcu_dereference_raw(parent->rb_right); } else { int ret; if (!tuple) { nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; } spin_lock_bh(&rbconn->list.list_lock); /* Node might be about to be free'd. * We need to defer to insert_tree() in this case. */ if (rbconn->list.count == 0) { spin_unlock_bh(&rbconn->list.list_lock); break; } /* same source network -> be counted! */ ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); spin_unlock_bh(&rbconn->list.list_lock); if (ret) return 0; /* hotdrop */ else return rbconn->list.count; } } if (!tuple) return 0; return insert_tree(net, data, root, hash, key, tuple, zone); } static void tree_gc_worker(struct work_struct *work) { struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work); struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn; struct rb_root *root; struct rb_node *node; unsigned int tree, next_tree, gc_count = 0; tree = data->gc_tree % CONNCOUNT_SLOTS; root = &data->root[tree]; local_bh_disable(); rcu_read_lock(); for (node = rb_first(root); node != NULL; node = rb_next(node)) { rbconn = rb_entry(node, struct nf_conncount_rb, node); if (nf_conncount_gc_list(data->net, &rbconn->list)) gc_count++; } rcu_read_unlock(); local_bh_enable(); cond_resched(); spin_lock_bh(&nf_conncount_locks[tree]); if (gc_count < ARRAY_SIZE(gc_nodes)) goto next; /* do not bother */ gc_count = 0; node = rb_first(root); while (node != NULL) { rbconn = rb_entry(node, struct nf_conncount_rb, node); node = rb_next(node); if (rbconn->list.count > 0) continue; gc_nodes[gc_count++] = rbconn; if (gc_count >= ARRAY_SIZE(gc_nodes)) { tree_nodes_free(root, gc_nodes, gc_count); gc_count = 0; } } tree_nodes_free(root, gc_nodes, gc_count); next: clear_bit(tree, data->pending_trees); next_tree = (tree + 1) % CONNCOUNT_SLOTS; next_tree = find_next_bit(data->pending_trees, CONNCOUNT_SLOTS, next_tree); if (next_tree < CONNCOUNT_SLOTS) { data->gc_tree = next_tree; schedule_work(work); } spin_unlock_bh(&nf_conncount_locks[tree]); } /* Count and return number of conntrack entries in 'net' with particular 'key'. * If 'tuple' is not null, insert it into the accounting data structure. * Call with RCU read lock. */ unsigned int nf_conncount_count(struct net *net, struct nf_conncount_data *data, const u32 *key, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { return count_tree(net, data, key, tuple, zone); } EXPORT_SYMBOL_GPL(nf_conncount_count); struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, unsigned int keylen) { struct nf_conncount_data *data; int ret, i; if (keylen % sizeof(u32) || keylen / sizeof(u32) > MAX_KEYLEN || keylen == 0) return ERR_PTR(-EINVAL); net_get_random_once(&conncount_rnd, sizeof(conncount_rnd)); data = kmalloc(sizeof(*data), GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); ret = nf_ct_netns_get(net, family); if (ret < 0) { kfree(data); return ERR_PTR(ret); } for (i = 0; i < ARRAY_SIZE(data->root); ++i) data->root[i] = RB_ROOT; data->keylen = keylen / sizeof(u32); data->net = net; INIT_WORK(&data->gc_work, tree_gc_worker); return data; } EXPORT_SYMBOL_GPL(nf_conncount_init); void nf_conncount_cache_free(struct nf_conncount_list *list) { struct nf_conncount_tuple *conn, *conn_n; list_for_each_entry_safe(conn, conn_n, &list->head, node) kmem_cache_free(conncount_conn_cachep, conn); } EXPORT_SYMBOL_GPL(nf_conncount_cache_free); static void destroy_tree(struct rb_root *r) { struct nf_conncount_rb *rbconn; struct rb_node *node; while ((node = rb_first(r)) != NULL) { rbconn = rb_entry(node, struct nf_conncount_rb, node); rb_erase(node, r); nf_conncount_cache_free(&rbconn->list); kmem_cache_free(conncount_rb_cachep, rbconn); } } void nf_conncount_destroy(struct net *net, unsigned int family, struct nf_conncount_data *data) { unsigned int i; cancel_work_sync(&data->gc_work); nf_ct_netns_put(net, family); for (i = 0; i < ARRAY_SIZE(data->root); ++i) destroy_tree(&data->root[i]); kfree(data); } EXPORT_SYMBOL_GPL(nf_conncount_destroy); static int __init nf_conncount_modinit(void) { int i; for (i = 0; i < CONNCOUNT_SLOTS; ++i) spin_lock_init(&nf_conncount_locks[i]); conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple", sizeof(struct nf_conncount_tuple), 0, 0, NULL); if (!conncount_conn_cachep) return -ENOMEM; conncount_rb_cachep = kmem_cache_create("nf_conncount_rb", sizeof(struct nf_conncount_rb), 0, 0, NULL); if (!conncount_rb_cachep) { kmem_cache_destroy(conncount_conn_cachep); return -ENOMEM; } return 0; } static void __exit nf_conncount_modexit(void) { kmem_cache_destroy(conncount_conn_cachep); kmem_cache_destroy(conncount_rb_cachep); } module_init(nf_conncount_modinit); module_exit(nf_conncount_modexit); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_DESCRIPTION("netfilter: count number of connections matching a key"); MODULE_LICENSE("GPL"); |
2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ZyDAS ZD1301 driver (USB interface) * * Copyright (C) 2015 Antti Palosaari <crope@iki.fi> */ #include "dvb_usb.h" #include "zd1301_demod.h" #include "mt2060.h" #include <linux/i2c.h> #include <linux/platform_device.h> DVB_DEFINE_MOD_OPT_ADAPTER_NR(adapter_nr); struct zd1301_dev { #define BUF_LEN 8 u8 buf[BUF_LEN]; /* bulk USB control message */ struct zd1301_demod_platform_data demod_pdata; struct mt2060_platform_data mt2060_pdata; struct platform_device *platform_device_demod; struct i2c_client *i2c_client_tuner; }; static int zd1301_ctrl_msg(struct dvb_usb_device *d, const u8 *wbuf, unsigned int wlen, u8 *rbuf, unsigned int rlen) { struct zd1301_dev *dev = d_to_priv(d); struct usb_interface *intf = d->intf; int ret, actual_length; mutex_lock(&d->usb_mutex); memcpy(&dev->buf, wbuf, wlen); dev_dbg(&intf->dev, ">>> %*ph\n", wlen, dev->buf); ret = usb_bulk_msg(d->udev, usb_sndbulkpipe(d->udev, 0x04), dev->buf, wlen, &actual_length, 1000); if (ret) { dev_err(&intf->dev, "1st usb_bulk_msg() failed %d\n", ret); goto err_mutex_unlock; } if (rlen) { ret = usb_bulk_msg(d->udev, usb_rcvbulkpipe(d->udev, 0x83), dev->buf, rlen, &actual_length, 1000); if (ret) { dev_err(&intf->dev, "2nd usb_bulk_msg() failed %d\n", ret); goto err_mutex_unlock; } dev_dbg(&intf->dev, "<<< %*ph\n", actual_length, dev->buf); if (actual_length != rlen) { /* * Chip replies often with 3 byte len stub. On that case * we have to query new reply. */ dev_dbg(&intf->dev, "repeating reply message\n"); ret = usb_bulk_msg(d->udev, usb_rcvbulkpipe(d->udev, 0x83), dev->buf, rlen, &actual_length, 1000); if (ret) { dev_err(&intf->dev, "3rd usb_bulk_msg() failed %d\n", ret); goto err_mutex_unlock; } dev_dbg(&intf->dev, "<<< %*ph\n", actual_length, dev->buf); } memcpy(rbuf, dev->buf, rlen); } err_mutex_unlock: mutex_unlock(&d->usb_mutex); return ret; } static int zd1301_demod_wreg(void *reg_priv, u16 reg, u8 val) { struct dvb_usb_device *d = reg_priv; struct usb_interface *intf = d->intf; int ret; u8 buf[7] = {0x07, 0x00, 0x03, 0x01, (reg >> 0) & 0xff, (reg >> 8) & 0xff, val}; ret = zd1301_ctrl_msg(d, buf, 7, NULL, 0); if (ret) goto err; return 0; err: dev_dbg(&intf->dev, "failed=%d\n", ret); return ret; } static int zd1301_demod_rreg(void *reg_priv, u16 reg, u8 *val) { struct dvb_usb_device *d = reg_priv; struct usb_interface *intf = d->intf; int ret; u8 buf[7] = {0x07, 0x00, 0x04, 0x01, (reg >> 0) & 0xff, (reg >> 8) & 0xff, 0}; ret = zd1301_ctrl_msg(d, buf, 7, buf, 7); if (ret) goto err; *val = buf[6]; return 0; err: dev_dbg(&intf->dev, "failed=%d\n", ret); return ret; } static int zd1301_frontend_attach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *d = adap_to_d(adap); struct zd1301_dev *dev = adap_to_priv(adap); struct usb_interface *intf = d->intf; struct platform_device *pdev; struct i2c_client *client; struct i2c_board_info board_info; struct i2c_adapter *adapter; struct dvb_frontend *frontend; int ret; dev_dbg(&intf->dev, "\n"); /* Add platform demod */ dev->demod_pdata.reg_priv = d; dev->demod_pdata.reg_read = zd1301_demod_rreg; dev->demod_pdata.reg_write = zd1301_demod_wreg; request_module("%s", "zd1301_demod"); pdev = platform_device_register_data(&intf->dev, "zd1301_demod", PLATFORM_DEVID_AUTO, &dev->demod_pdata, sizeof(dev->demod_pdata)); if (IS_ERR(pdev)) { ret = PTR_ERR(pdev); goto err; } if (!pdev->dev.driver) { ret = -ENODEV; goto err_platform_device_unregister; } if (!try_module_get(pdev->dev.driver->owner)) { ret = -ENODEV; goto err_platform_device_unregister; } adapter = zd1301_demod_get_i2c_adapter(pdev); frontend = zd1301_demod_get_dvb_frontend(pdev); if (!adapter || !frontend) { ret = -ENODEV; goto err_module_put_demod; } /* Add I2C tuner */ dev->mt2060_pdata.i2c_write_max = 9; dev->mt2060_pdata.dvb_frontend = frontend; memset(&board_info, 0, sizeof(board_info)); strscpy(board_info.type, "mt2060", I2C_NAME_SIZE); board_info.addr = 0x60; board_info.platform_data = &dev->mt2060_pdata; request_module("%s", "mt2060"); client = i2c_new_client_device(adapter, &board_info); if (!i2c_client_has_driver(client)) { ret = -ENODEV; goto err_module_put_demod; } if (!try_module_get(client->dev.driver->owner)) { ret = -ENODEV; goto err_i2c_unregister_device; } dev->platform_device_demod = pdev; dev->i2c_client_tuner = client; adap->fe[0] = frontend; return 0; err_i2c_unregister_device: i2c_unregister_device(client); err_module_put_demod: module_put(pdev->dev.driver->owner); err_platform_device_unregister: platform_device_unregister(pdev); err: dev_dbg(&intf->dev, "failed=%d\n", ret); return ret; } static int zd1301_frontend_detach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *d = adap_to_d(adap); struct zd1301_dev *dev = d_to_priv(d); struct usb_interface *intf = d->intf; struct platform_device *pdev; struct i2c_client *client; dev_dbg(&intf->dev, "\n"); client = dev->i2c_client_tuner; pdev = dev->platform_device_demod; /* Remove I2C tuner */ if (client) { module_put(client->dev.driver->owner); i2c_unregister_device(client); } /* Remove platform demod */ if (pdev) { module_put(pdev->dev.driver->owner); platform_device_unregister(pdev); } return 0; } static int zd1301_streaming_ctrl(struct dvb_frontend *fe, int onoff) { struct dvb_usb_device *d = fe_to_d(fe); struct usb_interface *intf = d->intf; int ret; u8 buf[3] = {0x03, 0x00, onoff ? 0x07 : 0x08}; dev_dbg(&intf->dev, "onoff=%d\n", onoff); ret = zd1301_ctrl_msg(d, buf, 3, NULL, 0); if (ret) goto err; return 0; err: dev_dbg(&intf->dev, "failed=%d\n", ret); return ret; } static const struct dvb_usb_device_properties zd1301_props = { .driver_name = KBUILD_MODNAME, .owner = THIS_MODULE, .adapter_nr = adapter_nr, .size_of_priv = sizeof(struct zd1301_dev), .frontend_attach = zd1301_frontend_attach, .frontend_detach = zd1301_frontend_detach, .streaming_ctrl = zd1301_streaming_ctrl, .num_adapters = 1, .adapter = { { .stream = DVB_USB_STREAM_BULK(0x81, 6, 21 * 188), }, }, }; static const struct usb_device_id zd1301_id_table[] = { {DVB_USB_DEVICE(USB_VID_ZYDAS, 0x13a1, &zd1301_props, "ZyDAS ZD1301 reference design", NULL)}, {} }; MODULE_DEVICE_TABLE(usb, zd1301_id_table); /* Usb specific object needed to register this driver with the usb subsystem */ static struct usb_driver zd1301_usb_driver = { .name = KBUILD_MODNAME, .id_table = zd1301_id_table, .probe = dvb_usbv2_probe, .disconnect = dvb_usbv2_disconnect, .suspend = dvb_usbv2_suspend, .resume = dvb_usbv2_resume, .reset_resume = dvb_usbv2_reset_resume, .no_dynamic_id = 1, .soft_unbind = 1, }; module_usb_driver(zd1301_usb_driver); MODULE_AUTHOR("Antti Palosaari <crope@iki.fi>"); MODULE_DESCRIPTION("ZyDAS ZD1301 driver"); MODULE_LICENSE("GPL"); |
7 1 5 1 16 1 14 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 | // SPDX-License-Identifier: GPL-2.0-or-later /* Kernel module to match Segment Routing Header (SRH) parameters. */ /* Author: * Ahmed Abdelsalam <amsalam20@gmail.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/ipv6.h> #include <net/seg6.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6t_srh.h> #include <linux/netfilter_ipv6/ip6_tables.h> /* Test a struct->mt_invflags and a boolean for inequality */ #define NF_SRH_INVF(ptr, flag, boolean) \ ((boolean) ^ !!((ptr)->mt_invflags & (flag))) static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par) { const struct ip6t_srh *srhinfo = par->matchinfo; struct ipv6_sr_hdr *srh; struct ipv6_sr_hdr _srh; int hdrlen, srhoff = 0; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) return false; srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh); if (!srh) return false; hdrlen = ipv6_optlen(srh); if (skb->len - srhoff < hdrlen) return false; if (srh->type != IPV6_SRCRT_TYPE_4) return false; if (srh->segments_left > srh->first_segment) return false; /* Next Header matching */ if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR, !(srh->nexthdr == srhinfo->next_hdr))) return false; /* Header Extension Length matching */ if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ, !(srh->hdrlen == srhinfo->hdr_len))) return false; if (srhinfo->mt_flags & IP6T_SRH_LEN_GT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT, !(srh->hdrlen > srhinfo->hdr_len))) return false; if (srhinfo->mt_flags & IP6T_SRH_LEN_LT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT, !(srh->hdrlen < srhinfo->hdr_len))) return false; /* Segments Left matching */ if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ, !(srh->segments_left == srhinfo->segs_left))) return false; if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT, !(srh->segments_left > srhinfo->segs_left))) return false; if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT, !(srh->segments_left < srhinfo->segs_left))) return false; /** * Last Entry matching * Last_Entry field was introduced in revision 6 of the SRH draft. * It was called First_Segment in the previous revision */ if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ, !(srh->first_segment == srhinfo->last_entry))) return false; if (srhinfo->mt_flags & IP6T_SRH_LAST_GT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT, !(srh->first_segment > srhinfo->last_entry))) return false; if (srhinfo->mt_flags & IP6T_SRH_LAST_LT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT, !(srh->first_segment < srhinfo->last_entry))) return false; /** * Tag matchig * Tag field was introduced in revision 6 of the SRH draft. */ if (srhinfo->mt_flags & IP6T_SRH_TAG) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG, !(srh->tag == srhinfo->tag))) return false; return true; } static bool srh1_mt6(const struct sk_buff *skb, struct xt_action_param *par) { int hdrlen, psidoff, nsidoff, lsidoff, srhoff = 0; const struct ip6t_srh1 *srhinfo = par->matchinfo; struct in6_addr *psid, *nsid, *lsid; struct in6_addr _psid, _nsid, _lsid; struct ipv6_sr_hdr *srh; struct ipv6_sr_hdr _srh; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) return false; srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh); if (!srh) return false; hdrlen = ipv6_optlen(srh); if (skb->len - srhoff < hdrlen) return false; if (srh->type != IPV6_SRCRT_TYPE_4) return false; if (srh->segments_left > srh->first_segment) return false; /* Next Header matching */ if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR, !(srh->nexthdr == srhinfo->next_hdr))) return false; /* Header Extension Length matching */ if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ, !(srh->hdrlen == srhinfo->hdr_len))) return false; if (srhinfo->mt_flags & IP6T_SRH_LEN_GT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT, !(srh->hdrlen > srhinfo->hdr_len))) return false; if (srhinfo->mt_flags & IP6T_SRH_LEN_LT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT, !(srh->hdrlen < srhinfo->hdr_len))) return false; /* Segments Left matching */ if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ, !(srh->segments_left == srhinfo->segs_left))) return false; if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT, !(srh->segments_left > srhinfo->segs_left))) return false; if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT, !(srh->segments_left < srhinfo->segs_left))) return false; /** * Last Entry matching * Last_Entry field was introduced in revision 6 of the SRH draft. * It was called First_Segment in the previous revision */ if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ, !(srh->first_segment == srhinfo->last_entry))) return false; if (srhinfo->mt_flags & IP6T_SRH_LAST_GT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT, !(srh->first_segment > srhinfo->last_entry))) return false; if (srhinfo->mt_flags & IP6T_SRH_LAST_LT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT, !(srh->first_segment < srhinfo->last_entry))) return false; /** * Tag matchig * Tag field was introduced in revision 6 of the SRH draft */ if (srhinfo->mt_flags & IP6T_SRH_TAG) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG, !(srh->tag == srhinfo->tag))) return false; /* Previous SID matching */ if (srhinfo->mt_flags & IP6T_SRH_PSID) { if (srh->segments_left == srh->first_segment) return false; psidoff = srhoff + sizeof(struct ipv6_sr_hdr) + ((srh->segments_left + 1) * sizeof(struct in6_addr)); psid = skb_header_pointer(skb, psidoff, sizeof(_psid), &_psid); if (!psid) return false; if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_PSID, ipv6_masked_addr_cmp(psid, &srhinfo->psid_msk, &srhinfo->psid_addr))) return false; } /* Next SID matching */ if (srhinfo->mt_flags & IP6T_SRH_NSID) { if (srh->segments_left == 0) return false; nsidoff = srhoff + sizeof(struct ipv6_sr_hdr) + ((srh->segments_left - 1) * sizeof(struct in6_addr)); nsid = skb_header_pointer(skb, nsidoff, sizeof(_nsid), &_nsid); if (!nsid) return false; if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NSID, ipv6_masked_addr_cmp(nsid, &srhinfo->nsid_msk, &srhinfo->nsid_addr))) return false; } /* Last SID matching */ if (srhinfo->mt_flags & IP6T_SRH_LSID) { lsidoff = srhoff + sizeof(struct ipv6_sr_hdr); lsid = skb_header_pointer(skb, lsidoff, sizeof(_lsid), &_lsid); if (!lsid) return false; if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LSID, ipv6_masked_addr_cmp(lsid, &srhinfo->lsid_msk, &srhinfo->lsid_addr))) return false; } return true; } static int srh_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_srh *srhinfo = par->matchinfo; if (srhinfo->mt_flags & ~IP6T_SRH_MASK) { pr_info_ratelimited("unknown srh match flags %X\n", srhinfo->mt_flags); return -EINVAL; } if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) { pr_info_ratelimited("unknown srh invflags %X\n", srhinfo->mt_invflags); return -EINVAL; } return 0; } static int srh1_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_srh1 *srhinfo = par->matchinfo; if (srhinfo->mt_flags & ~IP6T_SRH_MASK) { pr_info_ratelimited("unknown srh match flags %X\n", srhinfo->mt_flags); return -EINVAL; } if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) { pr_info_ratelimited("unknown srh invflags %X\n", srhinfo->mt_invflags); return -EINVAL; } return 0; } static struct xt_match srh_mt6_reg[] __read_mostly = { { .name = "srh", .revision = 0, .family = NFPROTO_IPV6, .match = srh_mt6, .matchsize = sizeof(struct ip6t_srh), .checkentry = srh_mt6_check, .me = THIS_MODULE, }, { .name = "srh", .revision = 1, .family = NFPROTO_IPV6, .match = srh1_mt6, .matchsize = sizeof(struct ip6t_srh1), .checkentry = srh1_mt6_check, .me = THIS_MODULE, } }; static int __init srh_mt6_init(void) { return xt_register_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg)); } static void __exit srh_mt6_exit(void) { xt_unregister_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg)); } module_init(srh_mt6_init); module_exit(srh_mt6_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: IPv6 Segment Routing Header match"); MODULE_AUTHOR("Ahmed Abdelsalam <amsalam20@gmail.com>"); |
54 35 29 4 5 179 2253 2239 21 171 715 2058 83 9 7760 2058 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H #include <linux/sched/coredump.h> #include <linux/mm_types.h> #include <linux/fs.h> /* only for vma_is_dax() */ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); void huge_pmd_set_accessed(struct vm_fault *vmf); int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, struct vm_area_struct *vma); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); #else static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) { } #endif vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf); struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags); bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long next); int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr); int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr); bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd); int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, unsigned long cp_flags); vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, pgprot_t pgprot, bool write); /** * vmf_insert_pfn_pmd - insert a pmd size pfn * @vmf: Structure describing the fault * @pfn: pfn to insert * @pgprot: page protection to use * @write: whether it's a write fault * * Insert a pmd size pfn. See vmf_insert_pfn() for additional info. * * Return: vm_fault_t value. */ static inline vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) { return vmf_insert_pfn_pmd_prot(vmf, pfn, vmf->vma->vm_page_prot, write); } vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn, pgprot_t pgprot, bool write); /** * vmf_insert_pfn_pud - insert a pud size pfn * @vmf: Structure describing the fault * @pfn: pfn to insert * @pgprot: page protection to use * @write: whether it's a write fault * * Insert a pud size pfn. See vmf_insert_pfn() for additional info. * * Return: vm_fault_t value. */ static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) { return vmf_insert_pfn_pud_prot(vmf, pfn, vmf->vma->vm_page_prot, write); } enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_NEVER_DAX, TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, }; struct kobject; struct kobj_attribute; ssize_t single_hugepage_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag flag); ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define HPAGE_PMD_SHIFT PMD_SHIFT #define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) #define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1)) #define HPAGE_PUD_SHIFT PUD_SHIFT #define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) #define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1)) extern unsigned long transparent_hugepage_flags; static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long haddr) { /* Don't have to check pgoff for anonymous vma */ if (!vma_is_anonymous(vma)) { if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR)) return false; } if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return false; return true; } static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, unsigned long vm_flags) { /* Explicitly disabled through madvise. */ if ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; return true; } /* * to be used on vmas which are known to support THP. * Use transparent_hugepage_active otherwise */ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) { /* * If the hardware/firmware marked hugepage support disabled. */ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) return false; if (!transhuge_vma_enabled(vma, vma->vm_flags)) return false; if (vma_is_temporary_stack(vma)) return false; if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) return true; if (vma_is_dax(vma)) return true; if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)) return !!(vma->vm_flags & VM_HUGEPAGE); return false; } bool transparent_hugepage_active(struct vm_area_struct *vma); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); void prep_transhuge_page(struct page *page); void free_transhuge_page(struct page *page); bool is_transparent_hugepage(struct page *page); bool can_split_huge_page(struct page *page, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { return split_huge_page_to_list(page, NULL); } void deferred_split_huge_page(struct page *page); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct page *page); #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd) \ || pmd_devmap(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ false, NULL); \ } while (0) void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page); void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address); #define split_huge_pud(__vma, __pud, __address) \ do { \ pud_t *____pud = (__pud); \ if (pud_trans_huge(*____pud) \ || pud_devmap(*____pud)) \ __split_huge_pud(__vma, __pud, __address); \ } while (0) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); static inline int is_swap_pmd(pmd_t pmd) { return !pmd_none(pmd) && !pmd_present(pmd); } /* mmap_lock must be held on entry */ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) return __pmd_trans_huge_lock(pmd, vma); else return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { if (pud_trans_huge(*pud) || pud_devmap(*pud)) return __pud_trans_huge_lock(pud, vma); else return NULL; } /** * thp_head - Head page of a transparent huge page. * @page: Any page (tail, head or regular) found in the page cache. */ static inline struct page *thp_head(struct page *page) { return compound_head(page); } /** * thp_order - Order of a transparent huge page. * @page: Head page of a transparent huge page. */ static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); if (PageHead(page)) return HPAGE_PMD_ORDER; return 0; } /** * thp_nr_pages - The number of regular pages in this huge page. * @page: The head page of a huge page. */ static inline int thp_nr_pages(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); if (PageHead(page)) return HPAGE_PMD_NR; return 1; } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap); struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap); vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf); extern struct page *huge_zero_page; extern unsigned long huge_zero_pfn; static inline bool is_huge_zero_page(struct page *page) { return READ_ONCE(huge_zero_page) == page; } static inline bool is_huge_zero_pmd(pmd_t pmd) { return READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd) && pmd_present(pmd); } static inline bool is_huge_zero_pud(pud_t pud) { return false; } struct page *mm_get_huge_zero_page(struct mm_struct *mm); void mm_put_huge_zero_page(struct mm_struct *mm); #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) static inline bool thp_migration_supported(void) { return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } static inline struct list_head *page_deferred_list(struct page *page) { /* * Global or memcg deferred list in the second tail pages is * occupied by compound_head. */ return &page[2].deferred_list; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; }) static inline struct page *thp_head(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return page; } static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return 0; } static inline int thp_nr_pages(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return 1; } static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) { return false; } static inline bool transparent_hugepage_active(struct vm_area_struct *vma) { return false; } static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long haddr) { return false; } static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, unsigned long vm_flags) { return false; } static inline void prep_transhuge_page(struct page *page) {} static inline bool is_transparent_hugepage(struct page *page) { return false; } #define transparent_hugepage_flags 0UL #define thp_get_unmapped_area NULL static inline bool can_split_huge_page(struct page *page, int *pextra_pins) { BUILD_BUG(); return false; } static inline int split_huge_page_to_list(struct page *page, struct list_head *list) { return 0; } static inline int split_huge_page(struct page *page) { return 0; } static inline void deferred_split_huge_page(struct page *page) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct page *page) {} static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page) {} #define split_huge_pud(__vma, __pmd, __address) \ do { } while (0) static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { BUG(); return 0; } static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next) { } static inline int is_swap_pmd(pmd_t pmd) { return 0; } static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { return NULL; } static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) { return 0; } static inline bool is_huge_zero_page(struct page *page) { return false; } static inline bool is_huge_zero_pmd(pmd_t pmd) { return false; } static inline bool is_huge_zero_pud(pud_t pud) { return false; } static inline void mm_put_huge_zero_page(struct mm_struct *mm) { return; } static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { return NULL; } static inline struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap) { return NULL; } static inline bool thp_migration_supported(void) { return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /** * thp_size - Size of a transparent huge page. * @page: Head page of a transparent huge page. * * Return: Number of bytes in this page. */ static inline unsigned long thp_size(struct page *page) { return PAGE_SIZE << thp_order(page); } #endif /* _LINUX_HUGE_MM_H */ |
8 8 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 | // SPDX-License-Identifier: GPL-2.0-only /* * The NFC Controller Interface is the communication protocol between an * NFC Controller (NFCC) and a Device Host (DH). * This is the HCI over NCI implementation, as specified in the 10.2 * section of the NCI 1.1 specification. * * Copyright (C) 2014 STMicroelectronics SAS. All rights reserved. */ #include <linux/skbuff.h> #include "../nfc.h" #include <net/nfc/nci.h> #include <net/nfc/nci_core.h> #include <linux/nfc.h> struct nci_data { u8 conn_id; u8 pipe; u8 cmd; const u8 *data; u32 data_len; } __packed; struct nci_hci_create_pipe_params { u8 src_gate; u8 dest_host; u8 dest_gate; } __packed; struct nci_hci_create_pipe_resp { u8 src_host; u8 src_gate; u8 dest_host; u8 dest_gate; u8 pipe; } __packed; struct nci_hci_delete_pipe_noti { u8 pipe; } __packed; struct nci_hci_all_pipe_cleared_noti { u8 host; } __packed; struct nci_hcp_message { u8 header; /* type -cmd,evt,rsp- + instruction */ u8 data[]; } __packed; struct nci_hcp_packet { u8 header; /* cbit+pipe */ struct nci_hcp_message message; } __packed; #define NCI_HCI_ANY_SET_PARAMETER 0x01 #define NCI_HCI_ANY_GET_PARAMETER 0x02 #define NCI_HCI_ANY_CLOSE_PIPE 0x04 #define NCI_HCI_ADM_CLEAR_ALL_PIPE 0x14 #define NCI_HFP_NO_CHAINING 0x80 #define NCI_NFCEE_ID_HCI 0x80 #define NCI_EVT_HOT_PLUG 0x03 #define NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY 0x01 #define NCI_HCI_ADM_CREATE_PIPE 0x10 #define NCI_HCI_ADM_DELETE_PIPE 0x11 /* HCP headers */ #define NCI_HCI_HCP_PACKET_HEADER_LEN 1 #define NCI_HCI_HCP_MESSAGE_HEADER_LEN 1 #define NCI_HCI_HCP_HEADER_LEN 2 /* HCP types */ #define NCI_HCI_HCP_COMMAND 0x00 #define NCI_HCI_HCP_EVENT 0x01 #define NCI_HCI_HCP_RESPONSE 0x02 #define NCI_HCI_ADM_NOTIFY_PIPE_CREATED 0x12 #define NCI_HCI_ADM_NOTIFY_PIPE_DELETED 0x13 #define NCI_HCI_ADM_NOTIFY_ALL_PIPE_CLEARED 0x15 #define NCI_HCI_FRAGMENT 0x7f #define NCI_HCP_HEADER(type, instr) ((((type) & 0x03) << 6) |\ ((instr) & 0x3f)) #define NCI_HCP_MSG_GET_TYPE(header) ((header & 0xc0) >> 6) #define NCI_HCP_MSG_GET_CMD(header) (header & 0x3f) #define NCI_HCP_MSG_GET_PIPE(header) (header & 0x7f) static int nci_hci_result_to_errno(u8 result) { switch (result) { case NCI_HCI_ANY_OK: return 0; case NCI_HCI_ANY_E_REG_PAR_UNKNOWN: return -EOPNOTSUPP; case NCI_HCI_ANY_E_TIMEOUT: return -ETIME; default: return -1; } } /* HCI core */ static void nci_hci_reset_pipes(struct nci_hci_dev *hdev) { int i; for (i = 0; i < NCI_HCI_MAX_PIPES; i++) { hdev->pipes[i].gate = NCI_HCI_INVALID_GATE; hdev->pipes[i].host = NCI_HCI_INVALID_HOST; } memset(hdev->gate2pipe, NCI_HCI_INVALID_PIPE, sizeof(hdev->gate2pipe)); } static void nci_hci_reset_pipes_per_host(struct nci_dev *ndev, u8 host) { int i; for (i = 0; i < NCI_HCI_MAX_PIPES; i++) { if (ndev->hci_dev->pipes[i].host == host) { ndev->hci_dev->pipes[i].gate = NCI_HCI_INVALID_GATE; ndev->hci_dev->pipes[i].host = NCI_HCI_INVALID_HOST; } } } /* Fragment HCI data over NCI packet. * NFC Forum NCI 10.2.2 Data Exchange: * The payload of the Data Packets sent on the Logical Connection SHALL be * valid HCP packets, as defined within [ETSI_102622]. Each Data Packet SHALL * contain a single HCP packet. NCI Segmentation and Reassembly SHALL NOT be * applied to Data Messages in either direction. The HCI fragmentation mechanism * is used if required. */ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, const u8 data_type, const u8 *data, size_t data_len) { const struct nci_conn_info *conn_info; struct sk_buff *skb; int len, i, r; u8 cb = pipe; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; i = 0; skb = nci_skb_alloc(ndev, conn_info->max_pkt_payload_len + NCI_DATA_HDR_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; skb_reserve(skb, NCI_DATA_HDR_SIZE + 2); *(u8 *)skb_push(skb, 1) = data_type; do { /* If last packet add NCI_HFP_NO_CHAINING */ if (i + conn_info->max_pkt_payload_len - (skb->len + 1) >= data_len) { cb |= NCI_HFP_NO_CHAINING; len = data_len - i; } else { len = conn_info->max_pkt_payload_len - skb->len - 1; } *(u8 *)skb_push(skb, 1) = cb; if (len > 0) skb_put_data(skb, data + i, len); r = nci_send_data(ndev, conn_info->conn_id, skb); if (r < 0) return r; i += len; if (i < data_len) { skb = nci_skb_alloc(ndev, conn_info->max_pkt_payload_len + NCI_DATA_HDR_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; skb_reserve(skb, NCI_DATA_HDR_SIZE + 1); } } while (i < data_len); return i; } static void nci_hci_send_data_req(struct nci_dev *ndev, const void *opt) { const struct nci_data *data = opt; nci_hci_send_data(ndev, data->pipe, data->cmd, data->data, data->data_len); } int nci_hci_send_event(struct nci_dev *ndev, u8 gate, u8 event, const u8 *param, size_t param_len) { u8 pipe = ndev->hci_dev->gate2pipe[gate]; if (pipe == NCI_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; return nci_hci_send_data(ndev, pipe, NCI_HCP_HEADER(NCI_HCI_HCP_EVENT, event), param, param_len); } EXPORT_SYMBOL(nci_hci_send_event); int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, u8 cmd, const u8 *param, size_t param_len, struct sk_buff **skb) { const struct nci_hcp_message *message; const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 pipe = ndev->hci_dev->gate2pipe[gate]; if (pipe == NCI_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; data.conn_id = conn_info->conn_id; data.pipe = pipe; data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND, cmd); data.data = param; data.data_len = param_len; r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { message = (struct nci_hcp_message *)conn_info->rx_skb->data; r = nci_hci_result_to_errno( NCI_HCP_MSG_GET_CMD(message->header)); skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); if (!r && skb) *skb = conn_info->rx_skb; } return r; } EXPORT_SYMBOL(nci_hci_send_cmd); int nci_hci_clear_all_pipes(struct nci_dev *ndev) { int r; r = nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADM_CLEAR_ALL_PIPE, NULL, 0, NULL); if (r < 0) return r; nci_hci_reset_pipes(ndev->hci_dev); return r; } EXPORT_SYMBOL(nci_hci_clear_all_pipes); static void nci_hci_event_received(struct nci_dev *ndev, u8 pipe, u8 event, struct sk_buff *skb) { if (ndev->ops->hci_event_received) ndev->ops->hci_event_received(ndev, pipe, event, skb); } static void nci_hci_cmd_received(struct nci_dev *ndev, u8 pipe, u8 cmd, struct sk_buff *skb) { u8 gate = ndev->hci_dev->pipes[pipe].gate; u8 status = NCI_HCI_ANY_OK | ~NCI_HCI_FRAGMENT; u8 dest_gate, new_pipe; struct nci_hci_create_pipe_resp *create_info; struct nci_hci_delete_pipe_noti *delete_info; struct nci_hci_all_pipe_cleared_noti *cleared_info; pr_debug("from gate %x pipe %x cmd %x\n", gate, pipe, cmd); switch (cmd) { case NCI_HCI_ADM_NOTIFY_PIPE_CREATED: if (skb->len != 5) { status = NCI_HCI_ANY_E_NOK; goto exit; } create_info = (struct nci_hci_create_pipe_resp *)skb->data; dest_gate = create_info->dest_gate; new_pipe = create_info->pipe; if (new_pipe >= NCI_HCI_MAX_PIPES) { status = NCI_HCI_ANY_E_NOK; goto exit; } /* Save the new created pipe and bind with local gate, * the description for skb->data[3] is destination gate id * but since we received this cmd from host controller, we * are the destination and it is our local gate */ ndev->hci_dev->gate2pipe[dest_gate] = new_pipe; ndev->hci_dev->pipes[new_pipe].gate = dest_gate; ndev->hci_dev->pipes[new_pipe].host = create_info->src_host; break; case NCI_HCI_ANY_OPEN_PIPE: /* If the pipe is not created report an error */ if (gate == NCI_HCI_INVALID_GATE) { status = NCI_HCI_ANY_E_NOK; goto exit; } break; case NCI_HCI_ADM_NOTIFY_PIPE_DELETED: if (skb->len != 1) { status = NCI_HCI_ANY_E_NOK; goto exit; } delete_info = (struct nci_hci_delete_pipe_noti *)skb->data; if (delete_info->pipe >= NCI_HCI_MAX_PIPES) { status = NCI_HCI_ANY_E_NOK; goto exit; } ndev->hci_dev->pipes[delete_info->pipe].gate = NCI_HCI_INVALID_GATE; ndev->hci_dev->pipes[delete_info->pipe].host = NCI_HCI_INVALID_HOST; break; case NCI_HCI_ADM_NOTIFY_ALL_PIPE_CLEARED: if (skb->len != 1) { status = NCI_HCI_ANY_E_NOK; goto exit; } cleared_info = (struct nci_hci_all_pipe_cleared_noti *)skb->data; nci_hci_reset_pipes_per_host(ndev, cleared_info->host); break; default: pr_debug("Discarded unknown cmd %x to gate %x\n", cmd, gate); break; } if (ndev->ops->hci_cmd_received) ndev->ops->hci_cmd_received(ndev, pipe, cmd, skb); exit: nci_hci_send_data(ndev, pipe, status, NULL, 0); kfree_skb(skb); } static void nci_hci_resp_received(struct nci_dev *ndev, u8 pipe, struct sk_buff *skb) { struct nci_conn_info *conn_info; conn_info = ndev->hci_dev->conn_info; if (!conn_info) goto exit; conn_info->rx_skb = skb; exit: nci_req_complete(ndev, NCI_STATUS_OK); } /* Receive hcp message for pipe, with type and cmd. * skb contains optional message data only. */ static void nci_hci_hcp_message_rx(struct nci_dev *ndev, u8 pipe, u8 type, u8 instruction, struct sk_buff *skb) { switch (type) { case NCI_HCI_HCP_RESPONSE: nci_hci_resp_received(ndev, pipe, skb); break; case NCI_HCI_HCP_COMMAND: nci_hci_cmd_received(ndev, pipe, instruction, skb); break; case NCI_HCI_HCP_EVENT: nci_hci_event_received(ndev, pipe, instruction, skb); break; default: pr_err("UNKNOWN MSG Type %d, instruction=%d\n", type, instruction); kfree_skb(skb); break; } nci_req_complete(ndev, NCI_STATUS_OK); } static void nci_hci_msg_rx_work(struct work_struct *work) { struct nci_hci_dev *hdev = container_of(work, struct nci_hci_dev, msg_rx_work); struct sk_buff *skb; const struct nci_hcp_message *message; u8 pipe, type, instruction; while ((skb = skb_dequeue(&hdev->msg_rx_queue)) != NULL) { pipe = NCI_HCP_MSG_GET_PIPE(skb->data[0]); skb_pull(skb, NCI_HCI_HCP_PACKET_HEADER_LEN); message = (struct nci_hcp_message *)skb->data; type = NCI_HCP_MSG_GET_TYPE(message->header); instruction = NCI_HCP_MSG_GET_CMD(message->header); skb_pull(skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); nci_hci_hcp_message_rx(hdev->ndev, pipe, type, instruction, skb); } } void nci_hci_data_received_cb(void *context, struct sk_buff *skb, int err) { struct nci_dev *ndev = (struct nci_dev *)context; struct nci_hcp_packet *packet; u8 pipe, type; struct sk_buff *hcp_skb; struct sk_buff *frag_skb; int msg_len; pr_debug("\n"); if (err) { nci_req_complete(ndev, err); return; } packet = (struct nci_hcp_packet *)skb->data; if ((packet->header & ~NCI_HCI_FRAGMENT) == 0) { skb_queue_tail(&ndev->hci_dev->rx_hcp_frags, skb); return; } /* it's the last fragment. Does it need re-aggregation? */ if (skb_queue_len(&ndev->hci_dev->rx_hcp_frags)) { pipe = NCI_HCP_MSG_GET_PIPE(packet->header); skb_queue_tail(&ndev->hci_dev->rx_hcp_frags, skb); msg_len = 0; skb_queue_walk(&ndev->hci_dev->rx_hcp_frags, frag_skb) { msg_len += (frag_skb->len - NCI_HCI_HCP_PACKET_HEADER_LEN); } hcp_skb = nfc_alloc_recv_skb(NCI_HCI_HCP_PACKET_HEADER_LEN + msg_len, GFP_KERNEL); if (!hcp_skb) { nci_req_complete(ndev, -ENOMEM); return; } skb_put_u8(hcp_skb, pipe); skb_queue_walk(&ndev->hci_dev->rx_hcp_frags, frag_skb) { msg_len = frag_skb->len - NCI_HCI_HCP_PACKET_HEADER_LEN; skb_put_data(hcp_skb, frag_skb->data + NCI_HCI_HCP_PACKET_HEADER_LEN, msg_len); } skb_queue_purge(&ndev->hci_dev->rx_hcp_frags); } else { packet->header &= NCI_HCI_FRAGMENT; hcp_skb = skb; } /* if this is a response, dispatch immediately to * unblock waiting cmd context. Otherwise, enqueue to dispatch * in separate context where handler can also execute command. */ packet = (struct nci_hcp_packet *)hcp_skb->data; type = NCI_HCP_MSG_GET_TYPE(packet->message.header); if (type == NCI_HCI_HCP_RESPONSE) { pipe = NCI_HCP_MSG_GET_PIPE(packet->header); skb_pull(hcp_skb, NCI_HCI_HCP_PACKET_HEADER_LEN); nci_hci_hcp_message_rx(ndev, pipe, type, NCI_STATUS_OK, hcp_skb); } else { skb_queue_tail(&ndev->hci_dev->msg_rx_queue, hcp_skb); schedule_work(&ndev->hci_dev->msg_rx_work); } } int nci_hci_open_pipe(struct nci_dev *ndev, u8 pipe) { struct nci_data data; const struct nci_conn_info *conn_info; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; data.conn_id = conn_info->conn_id; data.pipe = pipe; data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND, NCI_HCI_ANY_OPEN_PIPE); data.data = NULL; data.data_len = 0; return nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); } EXPORT_SYMBOL(nci_hci_open_pipe); static u8 nci_hci_create_pipe(struct nci_dev *ndev, u8 dest_host, u8 dest_gate, int *result) { u8 pipe; struct sk_buff *skb; struct nci_hci_create_pipe_params params; const struct nci_hci_create_pipe_resp *resp; pr_debug("gate=%d\n", dest_gate); params.src_gate = NCI_HCI_ADMIN_GATE; params.dest_host = dest_host; params.dest_gate = dest_gate; *result = nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADM_CREATE_PIPE, (u8 *)¶ms, sizeof(params), &skb); if (*result < 0) return NCI_HCI_INVALID_PIPE; resp = (struct nci_hci_create_pipe_resp *)skb->data; pipe = resp->pipe; kfree_skb(skb); pr_debug("pipe created=%d\n", pipe); if (pipe >= NCI_HCI_MAX_PIPES) pipe = NCI_HCI_INVALID_PIPE; return pipe; } static int nci_hci_delete_pipe(struct nci_dev *ndev, u8 pipe) { pr_debug("\n"); return nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADM_DELETE_PIPE, &pipe, 1, NULL); } int nci_hci_set_param(struct nci_dev *ndev, u8 gate, u8 idx, const u8 *param, size_t param_len) { const struct nci_hcp_message *message; const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 *tmp; u8 pipe = ndev->hci_dev->gate2pipe[gate]; pr_debug("idx=%d to gate %d\n", idx, gate); if (pipe == NCI_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; tmp = kmalloc(1 + param_len, GFP_KERNEL); if (!tmp) return -ENOMEM; *tmp = idx; memcpy(tmp + 1, param, param_len); data.conn_id = conn_info->conn_id; data.pipe = pipe; data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND, NCI_HCI_ANY_SET_PARAMETER); data.data = tmp; data.data_len = param_len + 1; r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { message = (struct nci_hcp_message *)conn_info->rx_skb->data; r = nci_hci_result_to_errno( NCI_HCP_MSG_GET_CMD(message->header)); skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); } kfree(tmp); return r; } EXPORT_SYMBOL(nci_hci_set_param); int nci_hci_get_param(struct nci_dev *ndev, u8 gate, u8 idx, struct sk_buff **skb) { const struct nci_hcp_message *message; const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 pipe = ndev->hci_dev->gate2pipe[gate]; pr_debug("idx=%d to gate %d\n", idx, gate); if (pipe == NCI_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; data.conn_id = conn_info->conn_id; data.pipe = pipe; data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND, NCI_HCI_ANY_GET_PARAMETER); data.data = &idx; data.data_len = 1; r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { message = (struct nci_hcp_message *)conn_info->rx_skb->data; r = nci_hci_result_to_errno( NCI_HCP_MSG_GET_CMD(message->header)); skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); if (!r && skb) *skb = conn_info->rx_skb; } return r; } EXPORT_SYMBOL(nci_hci_get_param); int nci_hci_connect_gate(struct nci_dev *ndev, u8 dest_host, u8 dest_gate, u8 pipe) { bool pipe_created = false; int r; if (pipe == NCI_HCI_DO_NOT_OPEN_PIPE) return 0; if (ndev->hci_dev->gate2pipe[dest_gate] != NCI_HCI_INVALID_PIPE) return -EADDRINUSE; if (pipe != NCI_HCI_INVALID_PIPE) goto open_pipe; switch (dest_gate) { case NCI_HCI_LINK_MGMT_GATE: pipe = NCI_HCI_LINK_MGMT_PIPE; break; case NCI_HCI_ADMIN_GATE: pipe = NCI_HCI_ADMIN_PIPE; break; default: pipe = nci_hci_create_pipe(ndev, dest_host, dest_gate, &r); if (pipe == NCI_HCI_INVALID_PIPE) return r; pipe_created = true; break; } open_pipe: r = nci_hci_open_pipe(ndev, pipe); if (r < 0) { if (pipe_created) { if (nci_hci_delete_pipe(ndev, pipe) < 0) { /* TODO: Cannot clean by deleting pipe... * -> inconsistent state */ } } return r; } ndev->hci_dev->pipes[pipe].gate = dest_gate; ndev->hci_dev->pipes[pipe].host = dest_host; ndev->hci_dev->gate2pipe[dest_gate] = pipe; return 0; } EXPORT_SYMBOL(nci_hci_connect_gate); static int nci_hci_dev_connect_gates(struct nci_dev *ndev, u8 gate_count, const struct nci_hci_gate *gates) { int r; while (gate_count--) { r = nci_hci_connect_gate(ndev, gates->dest_host, gates->gate, gates->pipe); if (r < 0) return r; gates++; } return 0; } int nci_hci_dev_session_init(struct nci_dev *ndev) { struct nci_conn_info *conn_info; struct sk_buff *skb; int r; ndev->hci_dev->count_pipes = 0; ndev->hci_dev->expected_pipes = 0; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; conn_info->data_exchange_cb = nci_hci_data_received_cb; conn_info->data_exchange_cb_context = ndev; nci_hci_reset_pipes(ndev->hci_dev); if (ndev->hci_dev->init_data.gates[0].gate != NCI_HCI_ADMIN_GATE) return -EPROTO; r = nci_hci_connect_gate(ndev, ndev->hci_dev->init_data.gates[0].dest_host, ndev->hci_dev->init_data.gates[0].gate, ndev->hci_dev->init_data.gates[0].pipe); if (r < 0) return r; r = nci_hci_get_param(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY, &skb); if (r < 0) return r; if (skb->len && skb->len == strlen(ndev->hci_dev->init_data.session_id) && !memcmp(ndev->hci_dev->init_data.session_id, skb->data, skb->len) && ndev->ops->hci_load_session) { /* Restore gate<->pipe table from some proprietary location. */ r = ndev->ops->hci_load_session(ndev); } else { r = nci_hci_clear_all_pipes(ndev); if (r < 0) goto exit; r = nci_hci_dev_connect_gates(ndev, ndev->hci_dev->init_data.gate_count, ndev->hci_dev->init_data.gates); if (r < 0) goto exit; r = nci_hci_set_param(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY, ndev->hci_dev->init_data.session_id, strlen(ndev->hci_dev->init_data.session_id)); } exit: kfree_skb(skb); return r; } EXPORT_SYMBOL(nci_hci_dev_session_init); struct nci_hci_dev *nci_hci_allocate(struct nci_dev *ndev) { struct nci_hci_dev *hdev; hdev = kzalloc(sizeof(*hdev), GFP_KERNEL); if (!hdev) return NULL; skb_queue_head_init(&hdev->rx_hcp_frags); INIT_WORK(&hdev->msg_rx_work, nci_hci_msg_rx_work); skb_queue_head_init(&hdev->msg_rx_queue); hdev->ndev = ndev; return hdev; } void nci_hci_deallocate(struct nci_dev *ndev) { kfree(ndev->hci_dev); } |
15 15 12960 12970 5 5 5 1 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * No idle tick implementation for low and high resolution timers * * Started by: Thomas Gleixner and Ingo Molnar */ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/percpu.h> #include <linux/nmi.h> #include <linux/profile.h> #include <linux/sched/signal.h> #include <linux/sched/clock.h> #include <linux/sched/stat.h> #include <linux/sched/nohz.h> #include <linux/sched/loadavg.h> #include <linux/module.h> #include <linux/irq_work.h> #include <linux/posix-timers.h> #include <linux/context_tracking.h> #include <linux/mm.h> #include <asm/irq_regs.h> #include "tick-internal.h" #include <trace/events/timer.h> /* * Per-CPU nohz control structure */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) /* * The time, when the last jiffy update happened. Write access must hold * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a * consistent view of jiffies and last_jiffies_update. */ static ktime_t last_jiffies_update; /* * Must be called with interrupts disabled ! */ static void tick_do_update_jiffies64(ktime_t now) { unsigned long ticks = 1; ktime_t delta, nextp; /* * 64bit can do a quick check without holding jiffies lock and * without looking at the sequence count. The smp_load_acquire() * pairs with the update done later in this function. * * 32bit cannot do that because the store of tick_next_period * consists of two 32bit stores and the first store could move it * to a random point in the future. */ if (IS_ENABLED(CONFIG_64BIT)) { if (ktime_before(now, smp_load_acquire(&tick_next_period))) return; } else { unsigned int seq; /* * Avoid contention on jiffies_lock and protect the quick * check with the sequence count. */ do { seq = read_seqcount_begin(&jiffies_seq); nextp = tick_next_period; } while (read_seqcount_retry(&jiffies_seq, seq)); if (ktime_before(now, nextp)) return; } /* Quick check failed, i.e. update is required. */ raw_spin_lock(&jiffies_lock); /* * Reevaluate with the lock held. Another CPU might have done the * update already. */ if (ktime_before(now, tick_next_period)) { raw_spin_unlock(&jiffies_lock); return; } write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, tick_next_period); if (unlikely(delta >= TICK_NSEC)) { /* Slow path for long idle sleep times */ s64 incr = TICK_NSEC; ticks += ktime_divns(delta, incr); last_jiffies_update = ktime_add_ns(last_jiffies_update, incr * ticks); } else { last_jiffies_update = ktime_add_ns(last_jiffies_update, TICK_NSEC); } /* Advance jiffies to complete the jiffies_seq protected job */ jiffies_64 += ticks; /* * Keep the tick_next_period variable up to date. */ nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); if (IS_ENABLED(CONFIG_64BIT)) { /* * Pairs with smp_load_acquire() in the lockless quick * check above and ensures that the update to jiffies_64 is * not reordered vs. the store to tick_next_period, neither * by the compiler nor by the CPU. */ smp_store_release(&tick_next_period, nextp); } else { /* * A plain store is good enough on 32bit as the quick check * above is protected by the sequence count. */ tick_next_period = nextp; } /* * Release the sequence count. calc_global_load() below is not * protected by it, but jiffies_lock needs to be held to prevent * concurrent invocations. */ write_seqcount_end(&jiffies_seq); calc_global_load(); raw_spin_unlock(&jiffies_lock); update_wall_time(); } /* * Initialize and return retrieve the jiffies update. */ static ktime_t tick_init_jiffy_update(void) { ktime_t period; raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? */ if (last_jiffies_update == 0) { u32 rem; /* * Ensure that the tick is aligned to a multiple of * TICK_NSEC. */ div_u64_rem(tick_next_period, TICK_NSEC, &rem); if (rem) tick_next_period += TICK_NSEC - rem; last_jiffies_update = tick_next_period; } period = last_jiffies_update; write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); return period; } #define MAX_STALLED_JIFFIES 5 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int cpu = smp_processor_id(); #ifdef CONFIG_NO_HZ_COMMON /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the CPU in charge went * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * jiffies_lock. * * If nohz_full is enabled, this should not happen because the * tick_do_timer_cpu never relinquishes. */ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL WARN_ON_ONCE(tick_nohz_full_running); #endif tick_do_timer_cpu = cpu; } #endif /* Check, if the jiffies need an update */ if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now); /* * If jiffies update stalled for too long (timekeeper in stop_machine() * or VMEXIT'ed for several msecs), force an update. */ if (ts->last_tick_jiffies != jiffies) { ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } else { if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { tick_do_update_jiffies64(now); ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } } if (ts->inidle) ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { #ifdef CONFIG_NO_HZ_COMMON /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long * time. This happens on complete idle SMP systems while * waiting on the login prompt. We also increment the "start of * idle" jiffy stamp so the idle accounting adjustment we do * when we go busy again does not account too much ticks. */ if (ts->tick_stopped) { touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; /* * In case the current tick fired too early past its expected * expiration, make sure we don't bypass the next clock reprogramming * to the same deadline. */ ts->next_tick = 0; } #endif update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } #endif #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; EXPORT_SYMBOL_GPL(tick_nohz_full_mask); bool tick_nohz_full_running; EXPORT_SYMBOL_GPL(tick_nohz_full_running); static atomic_t tick_dep_mask; static bool check_tick_dependency(atomic_t *dep) { int val = atomic_read(dep); if (val & TICK_DEP_MASK_POSIX_TIMER) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return true; } if (val & TICK_DEP_MASK_PERF_EVENTS) { trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); return true; } if (val & TICK_DEP_MASK_SCHED) { trace_tick_stop(0, TICK_DEP_MASK_SCHED); return true; } if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) { trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); return true; } if (val & TICK_DEP_MASK_RCU) { trace_tick_stop(0, TICK_DEP_MASK_RCU); return true; } if (val & TICK_DEP_MASK_RCU_EXP) { trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP); return true; } return false; } static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { lockdep_assert_irqs_disabled(); if (unlikely(!cpu_online(cpu))) return false; if (check_tick_dependency(&tick_dep_mask)) return false; if (check_tick_dependency(&ts->tick_dep_mask)) return false; if (check_tick_dependency(¤t->tick_dep_mask)) return false; if (check_tick_dependency(¤t->signal->tick_dep_mask)) return false; return true; } static void nohz_full_kick_func(struct irq_work *work) { /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = IRQ_WORK_INIT_HARD(nohz_full_kick_func); /* * Kick this CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), * is NMI safe. */ static void tick_nohz_full_kick(void) { if (!tick_nohz_full_cpu(smp_processor_id())) return; irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); } /* * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. */ void tick_nohz_full_kick_cpu(int cpu) { if (!tick_nohz_full_cpu(cpu)) return; irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } static void tick_nohz_kick_task(struct task_struct *tsk) { int cpu; /* * If the task is not running, run_posix_cpu_timers() * has nothing to elapse, IPI can then be spared. * * activate_task() STORE p->tick_dep_mask * STORE p->on_rq * __schedule() (switch to task 'p') smp_mb() (atomic_fetch_or()) * LOCK rq->lock LOAD p->on_rq * smp_mb__after_spin_lock() * tick_nohz_task_switch() * LOAD p->tick_dep_mask */ if (!sched_task_on_rq(tsk)) return; /* * If the task concurrently migrates to another CPU, * we guarantee it sees the new tick dependency upon * schedule. * * set_task_cpu(p, cpu); * STORE p->cpu = @cpu * __schedule() (switch to task 'p') * LOCK rq->lock * smp_mb__after_spin_lock() STORE p->tick_dep_mask * tick_nohz_task_switch() smp_mb() (atomic_fetch_or()) * LOAD p->tick_dep_mask LOAD p->cpu */ cpu = task_cpu(tsk); preempt_disable(); if (cpu_online(cpu)) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ static void tick_nohz_full_kick_all(void) { int cpu; if (!tick_nohz_full_running) return; preempt_disable(); for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } static void tick_nohz_dep_set_all(atomic_t *dep, enum tick_dep_bits bit) { int prev; prev = atomic_fetch_or(BIT(bit), dep); if (!prev) tick_nohz_full_kick_all(); } /* * Set a global tick dependency. Used by perf events that rely on freq and * by unstable clock. */ void tick_nohz_dep_set(enum tick_dep_bits bit) { tick_nohz_dep_set_all(&tick_dep_mask, bit); } void tick_nohz_dep_clear(enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tick_dep_mask); } /* * Set per-CPU tick dependency. Used by scheduler and perf events in order to * manage events throttling. */ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { int prev; struct tick_sched *ts; ts = per_cpu_ptr(&tick_cpu_sched, cpu); prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); if (!prev) { preempt_disable(); /* Perf needs local kick that is NMI safe */ if (cpu == smp_processor_id()) { tick_nohz_full_kick(); } else { /* Remote irq work not NMI-safe */ if (!WARN_ON_ONCE(in_nmi())) tick_nohz_full_kick_cpu(cpu); } preempt_enable(); } } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu); void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); atomic_andnot(BIT(bit), &ts->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* * Set a per-task tick dependency. RCU need this. Also posix CPU timers * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) tick_nohz_kick_task(tsk); } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tsk->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task); /* * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse * per process timers. */ void tick_nohz_dep_set_signal(struct task_struct *tsk, enum tick_dep_bits bit) { int prev; struct signal_struct *sig = tsk->signal; prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask); if (!prev) { struct task_struct *t; lockdep_assert_held(&tsk->sighand->siglock); __for_each_thread(sig, t) tick_nohz_kick_task(t); } } void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &sig->tick_dep_mask); } /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: * perf events, posix CPU timers, ... */ void __tick_nohz_task_switch(void) { struct tick_sched *ts; if (!tick_nohz_full_cpu(smp_processor_id())) return; ts = this_cpu_ptr(&tick_cpu_sched); if (ts->tick_stopped) { if (atomic_read(¤t->tick_dep_mask) || atomic_read(¤t->signal->tick_dep_mask)) tick_nohz_full_kick(); } } /* Get the boot-time nohz CPU list from the kernel parameters. */ void __init tick_nohz_full_setup(cpumask_var_t cpumask) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; } bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { /* * The tick_do_timer_cpu CPU handles housekeeping duty (unbound * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && tick_do_timer_cpu == cpu) return false; return true; } static int tick_nohz_cpu_down(unsigned int cpu) { return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY; } void __init tick_nohz_init(void) { int cpu, ret; if (!tick_nohz_full_running) return; /* * Full dynticks uses irq work to drive the tick rescheduling on safe * locking contexts. But then we need irq work to raise its own * interrupts to avoid circular dependency on the tick */ if (!arch_irq_work_has_interrupt()) { pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); tick_nohz_full_running = false; return; } if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { pr_warn("NO_HZ: Clearing %d from nohz_full range " "for timekeeping\n", cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } } for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "kernel/nohz:predown", NULL, tick_nohz_cpu_down); WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); } #endif /* * NOHZ - aka dynamic tick functionality */ #ifdef CONFIG_NO_HZ_COMMON /* * NO HZ enabled ? */ bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); bool tick_nohz_tick_stopped(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return ts->tick_stopped; } bool tick_nohz_tick_stopped_cpu(int cpu) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); return ts->tick_stopped; } /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * * Called from interrupt entry when the CPU was idle * * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy * value. We do this unconditionally on any CPU, as we don't know whether the * CPU, which has the update task assigned is in a long sleep. */ static void tick_nohz_update_jiffies(ktime_t now) { unsigned long flags; __this_cpu_write(tick_cpu_sched.idle_waketime, now); local_irq_save(flags); tick_do_update_jiffies64(now); local_irq_restore(flags); touch_softlockup_watchdog_sched(); } /* * Updates the per-CPU time idle statistics counters */ static void update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) { ktime_t delta; if (ts->idle_active) { delta = ktime_sub(now, ts->idle_entrytime); if (nr_iowait_cpu(cpu) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); else ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_entrytime = now; } if (last_update_time) *last_update_time = ktime_to_us(now); } static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { update_ts_time_stats(smp_processor_id(), ts, now, NULL); ts->idle_active = 0; sched_clock_idle_wakeup_event(); } static void tick_nohz_start_idle(struct tick_sched *ts) { ts->idle_entrytime = ktime_get(); ts->idle_active = 1; sched_clock_idle_sleep_event(); } /** * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative idle time (since boot) for a given * CPU, in microseconds. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * This function returns -1 if NOHZ is not enabled. */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, idle; if (!tick_nohz_active) return -1; now = ktime_get(); if (last_update_time) { update_ts_time_stats(cpu, ts, now, last_update_time); idle = ts->idle_sleeptime; } else { if (ts->idle_active && !nr_iowait_cpu(cpu)) { ktime_t delta = ktime_sub(now, ts->idle_entrytime); idle = ktime_add(ts->idle_sleeptime, delta); } else { idle = ts->idle_sleeptime; } } return ktime_to_us(idle); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * get_cpu_iowait_time_us - get the total iowait time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative iowait time (since boot) for a given * CPU, in microseconds. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * This function returns -1 if NOHZ is not enabled. */ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, iowait; if (!tick_nohz_active) return -1; now = ktime_get(); if (last_update_time) { update_ts_time_stats(cpu, ts, now, last_update_time); iowait = ts->iowait_sleeptime; } else { if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { ktime_t delta = ktime_sub(now, ts->idle_entrytime); iowait = ktime_add(ts->iowait_sleeptime, delta); } else { iowait = ts->iowait_sleeptime; } } return ktime_to_us(iowait); } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); hrtimer_set_expires(&ts->sched_timer, ts->last_tick); /* Forward the time to expire in the future */ hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); } else { tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } /* * Reset to make sure next tick stop doesn't get fooled by past * cached clock deadline. */ ts->next_tick = 0; } static inline bool local_timer_softirq_pending(void) { return local_softirq_pending() & BIT(TIMER_SOFTIRQ); } static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; unsigned long basejiff; unsigned int seq; /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqcount_begin(&jiffies_seq); basemono = last_jiffies_update; basejiff = jiffies; } while (read_seqcount_retry(&jiffies_seq, seq)); ts->last_jiffies = basejiff; ts->timer_expires_base = basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work * requests it. * Aside of that check whether the local timer softirq is * pending. If so its a bad idea to call get_next_timer_interrupt() * because there is an already expired timer, so it will request * immediate expiry, which rearms the hardware timer with a * minimal delta which brings us back to this place * immediately. Lather, rinse and repeat... */ if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* * Get the next pending timer. If high resolution * timers are enabled this only takes the timer wheel * timers into account. If high resolution timers are * disabled this also looks at the next expiring * hrtimer. */ next_tmr = get_next_timer_interrupt(basejiff, basemono); ts->next_timer = next_tmr; /* Take the next rcu event into account */ next_tick = next_rcu < next_tmr ? next_rcu : next_tmr; } /* * If the tick is due in the next period, keep it ticking or * force prod the timer. */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { /* * Tell the timer code that the base is not idle, i.e. undo * the effect of get_next_timer_interrupt(): */ timer_clear_idle(); /* * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. */ if (!ts->tick_stopped) { ts->timer_expires = 0; goto out; } } /* * If this CPU is the one which had the do_timer() duty last, we limit * the sleep time to the timekeeping max_deferment value. * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); if (cpu != tick_do_timer_cpu && (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last)) delta = KTIME_MAX; /* Calculate the next expiry time */ if (delta < (KTIME_MAX - basemono)) expires = basemono + delta; else expires = KTIME_MAX; ts->timer_expires = min_t(u64, expires, next_tick); out: return ts->timer_expires; } static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 basemono = ts->timer_expires_base; u64 expires = ts->timer_expires; ktime_t tick = expires; /* Make sure we won't be trying to stop it twice in a row. */ ts->timer_expires_base = 0; /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we * don't drop this here the jiffies might be stale and * do_timer() never invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. */ if (cpu == tick_do_timer_cpu) { tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->do_timer_last = 1; } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { ts->do_timer_last = 0; } /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; WARN_ON_ONCE(1); printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick, dev->next_event, hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); } /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when * interrupts arrive which do not cause a reschedule. In the * first call we save the current tick time, so we can restart * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { calc_load_nohz_start(); quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; trace_tick_stop(1, TICK_DEP_MASK_NONE); } ts->next_tick = tick; /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. */ if (unlikely(expires == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); else tick_program_event(KTIME_MAX, 1); return; } if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED_HARD); } else { hrtimer_set_expires(&ts->sched_timer, tick); tick_program_event(tick, 1); } } static void tick_nohz_retain_tick(struct tick_sched *ts) { ts->timer_expires_base = 0; } #ifdef CONFIG_NO_HZ_FULL static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu) { if (tick_nohz_next_event(ts, cpu)) tick_nohz_stop_tick(ts, cpu); else tick_nohz_retain_tick(ts); } #endif /* CONFIG_NO_HZ_FULL */ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); /* * Clear the timer idle flag, so we avoid IPIs on remote queueing and * the clock forward checks in the enqueue path: */ timer_clear_idle(); calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); /* * Cancel the scheduled timer and restore the tick */ ts->tick_stopped = 0; tick_nohz_restart(ts, now); } static void __tick_nohz_full_update_tick(struct tick_sched *ts, ktime_t now) { #ifdef CONFIG_NO_HZ_FULL int cpu = smp_processor_id(); if (can_stop_full_tick(cpu, ts)) tick_nohz_stop_sched_tick(ts, cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, now); #endif } static void tick_nohz_full_update_tick(struct tick_sched *ts) { if (!tick_nohz_full_cpu(smp_processor_id())) return; if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; __tick_nohz_full_update_tick(ts, ktime_get()); } static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { /* * If this CPU is offline and it is the one which updates * jiffies, then give up the assignment and let it be taken by * the CPU which runs the tick timer next. If we don't drop * this here the jiffies might be stale and do_timer() never * invoked. */ if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) tick_do_timer_cpu = TICK_DO_TIMER_NONE; /* * Make sure the CPU doesn't get fooled by obsolete tick * deadline if it comes back online later. */ ts->next_tick = 0; return false; } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) return false; if (need_resched()) return false; if (unlikely(local_softirq_pending())) { static int ratelimit; if (ratelimit < 10 && !local_bh_blocked() && (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n", (unsigned int) local_softirq_pending()); ratelimit++; } return false; } if (tick_nohz_full_enabled()) { /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around */ if (tick_do_timer_cpu == cpu) return false; /* Should not happen for nohz-full */ if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) return false; } return true; } static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) { ktime_t expires; int cpu = smp_processor_id(); /* * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the * tick timer expiration time is known already. */ if (ts->timer_expires_base) expires = ts->timer_expires; else if (can_stop_idle_tick(cpu, ts)) expires = tick_nohz_next_event(ts, cpu); else return; ts->idle_calls++; if (expires > 0LL) { int was_stopped = ts->tick_stopped; tick_nohz_stop_tick(ts, cpu); ts->idle_sleeps++; ts->idle_expires = expires; if (!was_stopped && ts->tick_stopped) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } } else { tick_nohz_retain_tick(ts); } } /** * tick_nohz_idle_stop_tick - stop the idle tick from the idle task * * When the next event is more than a tick into the future, stop the idle tick */ void tick_nohz_idle_stop_tick(void) { __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched)); } void tick_nohz_idle_retain_tick(void) { tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); /* * Undo the effect of get_next_timer_interrupt() called from * tick_nohz_next_event(). */ timer_clear_idle(); } /** * tick_nohz_idle_enter - prepare for entering idle on the current CPU * * Called when we start the idle loop. */ void tick_nohz_idle_enter(void) { struct tick_sched *ts; lockdep_assert_irqs_enabled(); local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 1; tick_nohz_start_idle(ts); local_irq_enable(); } /** * tick_nohz_irq_exit - update next tick event from interrupt exit * * When an interrupt fires while we are idle and it doesn't cause * a reschedule, it may still add, modify or delete a timer, enqueue * an RCU callback, etc... * So we need to re-calculate and reprogram the next tick event. */ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->inidle) tick_nohz_start_idle(ts); else tick_nohz_full_update_tick(ts); } /** * tick_nohz_idle_got_tick - Check whether or not the tick handler has run */ bool tick_nohz_idle_got_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->got_idle_tick) { ts->got_idle_tick = 0; return true; } return false; } /** * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer * or the tick, whatever that expires first. Note that, if the tick has been * stopped, it returns the next hrtimer. * * Called from power state control code with interrupts disabled */ ktime_t tick_nohz_get_next_hrtimer(void) { return __this_cpu_read(tick_cpu_device.evtdev)->next_event; } /** * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled. * * The return value of this function and/or the value returned by it through the * @delta_next pointer can be negative which must be taken into account by its * callers. */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); /* * The idle entry time is expected to be a sufficient approximation of * the current time at this point. */ ktime_t now = ts->idle_entrytime; ktime_t next_event; WARN_ON_ONCE(!ts->inidle); *delta_next = ktime_sub(dev->next_event, now); if (!can_stop_idle_tick(cpu, ts)) return *delta_next; next_event = tick_nohz_next_event(ts, cpu); if (!next_event) return *delta_next; /* * If the next highres timer to expire is earlier than next_event, the * idle governor needs to know that. */ next_event = min_t(u64, next_event, hrtimer_next_event_without(&ts->sched_timer)); return ktime_sub(next_event, now); } /** * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value * for a particular CPU. * * Called from the schedutil frequency scaling governor in scheduler context. */ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) { struct tick_sched *ts = tick_get_tick_sched(cpu); return ts->idle_calls; } /** * tick_nohz_get_idle_calls - return the current idle calls counter value * * Called from the schedutil frequency scaling governor in scheduler context. */ unsigned long tick_nohz_get_idle_calls(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return ts->idle_calls; } static void tick_nohz_account_idle_time(struct tick_sched *ts, ktime_t now) { unsigned long ticks; ts->idle_exittime = now; if (vtime_accounting_enabled_this_cpu()) return; /* * We stopped the tick in idle. Update process times would miss the * time we slept as update_process_times does only a 1 tick * accounting. Enforce that this is accounted to idle ! */ ticks = jiffies - ts->idle_jiffies; /* * We might be one off. Do not randomly account a huge number of ticks! */ if (ticks && ticks < LONG_MAX) account_idle_ticks(ticks); } void tick_nohz_idle_restart_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->tick_stopped) { ktime_t now = ktime_get(); tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } } static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) { if (tick_nohz_full_cpu(smp_processor_id())) __tick_nohz_full_update_tick(ts, now); else tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } /** * tick_nohz_idle_exit - restart the idle tick from the idle task * * Restart the idle tick when the CPU is woken up from idle * This also exit the RCU extended quiescent state. The CPU * can use RCU again after this function is called. */ void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); WARN_ON_ONCE(!ts->inidle); WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 0; idle_active = ts->idle_active; tick_stopped = ts->tick_stopped; if (idle_active || tick_stopped) now = ktime_get(); if (idle_active) tick_nohz_stop_idle(ts, now); if (tick_stopped) tick_nohz_idle_update_tick(ts, now); local_irq_enable(); } /* * The nohz low res interrupt handler */ static void tick_nohz_handler(struct clock_event_device *dev) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); dev->next_event = KTIME_MAX; tick_sched_do_timer(ts, now); tick_sched_handle(ts, regs); if (unlikely(ts->tick_stopped)) { /* * The clockevent device is not reprogrammed, so change the * clock event device to ONESHOT_STOPPED to avoid spurious * interrupts on devices which might not be truly one shot. */ tick_program_event(KTIME_MAX, 1); return; } hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { if (!tick_nohz_enabled) return; ts->nohz_mode = mode; /* One update is enough */ if (!test_and_set_bit(0, &tick_nohz_active)) timers_update_nohz(); } /** * tick_nohz_switch_to_nohz - switch to nohz mode */ static void tick_nohz_switch_to_nohz(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t next; if (!tick_nohz_enabled) return; if (tick_switch_to_oneshot(tick_nohz_handler)) return; /* * Recycle the hrtimer in ts, so we can share the * hrtimer_forward with the highres code. */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); /* Get the next period */ next = tick_init_jiffy_update(); hrtimer_set_expires(&ts->sched_timer, next); hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } static inline void tick_nohz_irq_enter(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; if (!ts->idle_active && !ts->tick_stopped) return; now = ktime_get(); if (ts->idle_active) tick_nohz_stop_idle(ts, now); /* * If all CPUs are idle. We may need to update a stale jiffies value. * Note nohz_full is a special case: a timekeeper is guaranteed to stay * alive but it might be busy looping with interrupts disabled in some * rare case (typically stop machine). So we must make sure we have a * last resort. */ if (ts->tick_stopped) tick_nohz_update_jiffies(now); } #else static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_nohz_irq_enter(void) { } static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { } #endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter to notify about the possible interruption of idle() */ void tick_irq_enter(void) { tick_check_oneshot_broadcast_this_cpu(); tick_nohz_irq_enter(); } /* * High resolution timer specific code */ #ifdef CONFIG_HIGH_RES_TIMERS /* * We rearm the timer until we get disabled by the idle code. * Called with interrupts disabled. */ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) { struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); tick_sched_do_timer(ts, now); /* * Do not call, when we are not in irq context and have * no valid regs pointer */ if (regs) tick_sched_handle(ts, regs); else ts->next_tick = 0; /* No need to reprogram if we are in idle or full dynticks mode */ if (unlikely(ts->tick_stopped)) return HRTIMER_NORESTART; hrtimer_forward(timer, now, TICK_NSEC); return HRTIMER_RESTART; } static int sched_skew_tick; static int __init skew_tick(char *str) { get_option(&str, &sched_skew_tick); return 0; } early_param("skew_tick", skew_tick); /** * tick_setup_sched_timer - setup the tick emulation timer */ void tick_setup_sched_timer(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now = ktime_get(); /* * Emulate tick processing via per-CPU hrtimers: */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); ts->sched_timer.function = tick_sched_timer; /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); /* Offset the tick to avert jiffies_lock contention. */ if (sched_skew_tick) { u64 offset = TICK_NSEC >> 1; do_div(offset, num_possible_cpus()); offset *= smp_processor_id(); hrtimer_add_expires_ns(&ts->sched_timer, offset); } hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); } #endif /* HIGH_RES_TIMERS */ #if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t idle_sleeptime, iowait_sleeptime; unsigned long idle_calls, idle_sleeps; # ifdef CONFIG_HIGH_RES_TIMERS if (ts->sched_timer.base) hrtimer_cancel(&ts->sched_timer); # endif idle_sleeptime = ts->idle_sleeptime; iowait_sleeptime = ts->iowait_sleeptime; idle_calls = ts->idle_calls; idle_sleeps = ts->idle_sleeps; memset(ts, 0, sizeof(*ts)); ts->idle_sleeptime = idle_sleeptime; ts->iowait_sleeptime = iowait_sleeptime; ts->idle_calls = idle_calls; ts->idle_sleeps = idle_sleeps; } #endif /** * Async notification about clocksource changes */ void tick_clock_notify(void) { int cpu; for_each_possible_cpu(cpu) set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); } /* * Async notification about clock event changes */ void tick_oneshot_notify(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); set_bit(0, &ts->check_clocks); } /** * Check, if a change happened, which makes oneshot possible. * * Called cyclic from the hrtimer softirq (driven by the timer * softirq) allow_nohz signals, that we can switch into low-res nohz * mode, because high resolution timers are disabled (either compile * or runtime). Called with interrupts disabled. */ int tick_check_oneshot_change(int allow_nohz) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (!test_and_clear_bit(0, &ts->check_clocks)) return 0; if (ts->nohz_mode != NOHZ_MODE_INACTIVE) return 0; if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) return 0; if (!allow_nohz) return 1; tick_nohz_switch_to_nohz(); return 0; } |
2 9 1 1 1 12 12 12 11 11 11 11 2 9 12 12 9 12 10 1 1 7 12 3 93 93 5 1 6 11 7 2 4 6 10 1 9 9 4 4 2 4 15 7 6 6 15 15 15 14 1 3 3 3 2 1 8 1 1 1 2 10 9 2 8 3 8 8 3 8 8 7 7 5 1 3 2 5 19 19 19 93 93 20 93 89 88 89 88 89 89 89 88 89 89 93 84 4 87 91 3 91 89 89 1 92 92 92 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 | // SPDX-License-Identifier: GPL-2.0-only /* * Input driver to ExplorerPS/2 device driver module. * * Copyright (c) 1999-2002 Vojtech Pavlik * Copyright (c) 2004 Dmitry Torokhov */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define MOUSEDEV_MINOR_BASE 32 #define MOUSEDEV_MINORS 31 #define MOUSEDEV_MIX 63 #include <linux/bitops.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/module.h> #include <linux/init.h> #include <linux/input.h> #include <linux/random.h> #include <linux/major.h> #include <linux/device.h> #include <linux/cdev.h> #include <linux/kernel.h> MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>"); MODULE_DESCRIPTION("Mouse (ExplorerPS/2) device interfaces"); MODULE_LICENSE("GPL"); #ifndef CONFIG_INPUT_MOUSEDEV_SCREEN_X #define CONFIG_INPUT_MOUSEDEV_SCREEN_X 1024 #endif #ifndef CONFIG_INPUT_MOUSEDEV_SCREEN_Y #define CONFIG_INPUT_MOUSEDEV_SCREEN_Y 768 #endif static int xres = CONFIG_INPUT_MOUSEDEV_SCREEN_X; module_param(xres, uint, 0644); MODULE_PARM_DESC(xres, "Horizontal screen resolution"); static int yres = CONFIG_INPUT_MOUSEDEV_SCREEN_Y; module_param(yres, uint, 0644); MODULE_PARM_DESC(yres, "Vertical screen resolution"); static unsigned tap_time = 200; module_param(tap_time, uint, 0644); MODULE_PARM_DESC(tap_time, "Tap time for touchpads in absolute mode (msecs)"); struct mousedev_hw_data { int dx, dy, dz; int x, y; int abs_event; unsigned long buttons; }; struct mousedev { int open; struct input_handle handle; wait_queue_head_t wait; struct list_head client_list; spinlock_t client_lock; /* protects client_list */ struct mutex mutex; struct device dev; struct cdev cdev; bool exist; struct list_head mixdev_node; bool opened_by_mixdev; struct mousedev_hw_data packet; unsigned int pkt_count; int old_x[4], old_y[4]; int frac_dx, frac_dy; unsigned long touch; int (*open_device)(struct mousedev *mousedev); void (*close_device)(struct mousedev *mousedev); }; enum mousedev_emul { MOUSEDEV_EMUL_PS2, MOUSEDEV_EMUL_IMPS, MOUSEDEV_EMUL_EXPS }; struct mousedev_motion { int dx, dy, dz; unsigned long buttons; }; #define PACKET_QUEUE_LEN 16 struct mousedev_client { struct fasync_struct *fasync; struct mousedev *mousedev; struct list_head node; struct mousedev_motion packets[PACKET_QUEUE_LEN]; unsigned int head, tail; spinlock_t packet_lock; int pos_x, pos_y; u8 ps2[6]; unsigned char ready, buffer, bufsiz; unsigned char imexseq, impsseq; enum mousedev_emul mode; unsigned long last_buttons; }; #define MOUSEDEV_SEQ_LEN 6 static unsigned char mousedev_imps_seq[] = { 0xf3, 200, 0xf3, 100, 0xf3, 80 }; static unsigned char mousedev_imex_seq[] = { 0xf3, 200, 0xf3, 200, 0xf3, 80 }; static struct mousedev *mousedev_mix; static LIST_HEAD(mousedev_mix_list); #define fx(i) (mousedev->old_x[(mousedev->pkt_count - (i)) & 03]) #define fy(i) (mousedev->old_y[(mousedev->pkt_count - (i)) & 03]) static void mousedev_touchpad_event(struct input_dev *dev, struct mousedev *mousedev, unsigned int code, int value) { int size, tmp; enum { FRACTION_DENOM = 128 }; switch (code) { case ABS_X: fx(0) = value; if (mousedev->touch && mousedev->pkt_count >= 2) { size = input_abs_get_max(dev, ABS_X) - input_abs_get_min(dev, ABS_X); if (size == 0) size = 256 * 2; tmp = ((value - fx(2)) * 256 * FRACTION_DENOM) / size; tmp += mousedev->frac_dx; mousedev->packet.dx = tmp / FRACTION_DENOM; mousedev->frac_dx = tmp - mousedev->packet.dx * FRACTION_DENOM; } break; case ABS_Y: fy(0) = value; if (mousedev->touch && mousedev->pkt_count >= 2) { /* use X size for ABS_Y to keep the same scale */ size = input_abs_get_max(dev, ABS_X) - input_abs_get_min(dev, ABS_X); if (size == 0) size = 256 * 2; tmp = -((value - fy(2)) * 256 * FRACTION_DENOM) / size; tmp += mousedev->frac_dy; mousedev->packet.dy = tmp / FRACTION_DENOM; mousedev->frac_dy = tmp - mousedev->packet.dy * FRACTION_DENOM; } break; } } static void mousedev_abs_event(struct input_dev *dev, struct mousedev *mousedev, unsigned int code, int value) { int min, max, size; switch (code) { case ABS_X: min = input_abs_get_min(dev, ABS_X); max = input_abs_get_max(dev, ABS_X); size = max - min; if (size == 0) size = xres ? : 1; value = clamp(value, min, max); mousedev->packet.x = ((value - min) * xres) / size; mousedev->packet.abs_event = 1; break; case ABS_Y: min = input_abs_get_min(dev, ABS_Y); max = input_abs_get_max(dev, ABS_Y); size = max - min; if (size == 0) size = yres ? : 1; value = clamp(value, min, max); mousedev->packet.y = yres - ((value - min) * yres) / size; mousedev->packet.abs_event = 1; break; } } static void mousedev_rel_event(struct mousedev *mousedev, unsigned int code, int value) { switch (code) { case REL_X: mousedev->packet.dx += value; break; case REL_Y: mousedev->packet.dy -= value; break; case REL_WHEEL: mousedev->packet.dz -= value; break; } } static void mousedev_key_event(struct mousedev *mousedev, unsigned int code, int value) { int index; switch (code) { case BTN_TOUCH: case BTN_0: case BTN_LEFT: index = 0; break; case BTN_STYLUS: case BTN_1: case BTN_RIGHT: index = 1; break; case BTN_2: case BTN_FORWARD: case BTN_STYLUS2: case BTN_MIDDLE: index = 2; break; case BTN_3: case BTN_BACK: case BTN_SIDE: index = 3; break; case BTN_4: case BTN_EXTRA: index = 4; break; default: return; } if (value) { set_bit(index, &mousedev->packet.buttons); set_bit(index, &mousedev_mix->packet.buttons); } else { clear_bit(index, &mousedev->packet.buttons); clear_bit(index, &mousedev_mix->packet.buttons); } } static void mousedev_notify_readers(struct mousedev *mousedev, struct mousedev_hw_data *packet) { struct mousedev_client *client; struct mousedev_motion *p; unsigned int new_head; int wake_readers = 0; rcu_read_lock(); list_for_each_entry_rcu(client, &mousedev->client_list, node) { /* Just acquire the lock, interrupts already disabled */ spin_lock(&client->packet_lock); p = &client->packets[client->head]; if (client->ready && p->buttons != mousedev->packet.buttons) { new_head = (client->head + 1) % PACKET_QUEUE_LEN; if (new_head != client->tail) { p = &client->packets[client->head = new_head]; memset(p, 0, sizeof(struct mousedev_motion)); } } if (packet->abs_event) { p->dx += packet->x - client->pos_x; p->dy += packet->y - client->pos_y; client->pos_x = packet->x; client->pos_y = packet->y; } client->pos_x += packet->dx; client->pos_x = clamp_val(client->pos_x, 0, xres); client->pos_y += packet->dy; client->pos_y = clamp_val(client->pos_y, 0, yres); p->dx += packet->dx; p->dy += packet->dy; p->dz += packet->dz; p->buttons = mousedev->packet.buttons; if (p->dx || p->dy || p->dz || p->buttons != client->last_buttons) client->ready = 1; spin_unlock(&client->packet_lock); if (client->ready) { kill_fasync(&client->fasync, SIGIO, POLL_IN); wake_readers = 1; } } rcu_read_unlock(); if (wake_readers) wake_up_interruptible(&mousedev->wait); } static void mousedev_touchpad_touch(struct mousedev *mousedev, int value) { if (!value) { if (mousedev->touch && time_before(jiffies, mousedev->touch + msecs_to_jiffies(tap_time))) { /* * Toggle left button to emulate tap. * We rely on the fact that mousedev_mix always has 0 * motion packet so we won't mess current position. */ set_bit(0, &mousedev->packet.buttons); set_bit(0, &mousedev_mix->packet.buttons); mousedev_notify_readers(mousedev, &mousedev_mix->packet); mousedev_notify_readers(mousedev_mix, &mousedev_mix->packet); clear_bit(0, &mousedev->packet.buttons); clear_bit(0, &mousedev_mix->packet.buttons); } mousedev->touch = mousedev->pkt_count = 0; mousedev->frac_dx = 0; mousedev->frac_dy = 0; } else if (!mousedev->touch) mousedev->touch = jiffies; } static void mousedev_event(struct input_handle *handle, unsigned int type, unsigned int code, int value) { struct mousedev *mousedev = handle->private; switch (type) { case EV_ABS: /* Ignore joysticks */ if (test_bit(BTN_TRIGGER, handle->dev->keybit)) return; if (test_bit(BTN_TOOL_FINGER, handle->dev->keybit)) mousedev_touchpad_event(handle->dev, mousedev, code, value); else mousedev_abs_event(handle->dev, mousedev, code, value); break; case EV_REL: mousedev_rel_event(mousedev, code, value); break; case EV_KEY: if (value != 2) { if (code == BTN_TOUCH && test_bit(BTN_TOOL_FINGER, handle->dev->keybit)) mousedev_touchpad_touch(mousedev, value); else mousedev_key_event(mousedev, code, value); } break; case EV_SYN: if (code == SYN_REPORT) { if (mousedev->touch) { mousedev->pkt_count++; /* * Input system eats duplicate events, * but we need all of them to do correct * averaging so apply present one forward */ fx(0) = fx(1); fy(0) = fy(1); } mousedev_notify_readers(mousedev, &mousedev->packet); mousedev_notify_readers(mousedev_mix, &mousedev->packet); mousedev->packet.dx = mousedev->packet.dy = mousedev->packet.dz = 0; mousedev->packet.abs_event = 0; } break; } } static int mousedev_fasync(int fd, struct file *file, int on) { struct mousedev_client *client = file->private_data; return fasync_helper(fd, file, on, &client->fasync); } static void mousedev_free(struct device *dev) { struct mousedev *mousedev = container_of(dev, struct mousedev, dev); input_put_device(mousedev->handle.dev); kfree(mousedev); } static int mousedev_open_device(struct mousedev *mousedev) { int retval; retval = mutex_lock_interruptible(&mousedev->mutex); if (retval) return retval; if (!mousedev->exist) retval = -ENODEV; else if (!mousedev->open++) { retval = input_open_device(&mousedev->handle); if (retval) mousedev->open--; } mutex_unlock(&mousedev->mutex); return retval; } static void mousedev_close_device(struct mousedev *mousedev) { mutex_lock(&mousedev->mutex); if (mousedev->exist && !--mousedev->open) input_close_device(&mousedev->handle); mutex_unlock(&mousedev->mutex); } /* * Open all available devices so they can all be multiplexed in one. * stream. Note that this function is called with mousedev_mix->mutex * held. */ static int mixdev_open_devices(struct mousedev *mixdev) { int error; error = mutex_lock_interruptible(&mixdev->mutex); if (error) return error; if (!mixdev->open++) { struct mousedev *mousedev; list_for_each_entry(mousedev, &mousedev_mix_list, mixdev_node) { if (!mousedev->opened_by_mixdev) { if (mousedev_open_device(mousedev)) continue; mousedev->opened_by_mixdev = true; } } } mutex_unlock(&mixdev->mutex); return 0; } /* * Close all devices that were opened as part of multiplexed * device. Note that this function is called with mousedev_mix->mutex * held. */ static void mixdev_close_devices(struct mousedev *mixdev) { mutex_lock(&mixdev->mutex); if (!--mixdev->open) { struct mousedev *mousedev; list_for_each_entry(mousedev, &mousedev_mix_list, mixdev_node) { if (mousedev->opened_by_mixdev) { mousedev->opened_by_mixdev = false; mousedev_close_device(mousedev); } } } mutex_unlock(&mixdev->mutex); } static void mousedev_attach_client(struct mousedev *mousedev, struct mousedev_client *client) { spin_lock(&mousedev->client_lock); list_add_tail_rcu(&client->node, &mousedev->client_list); spin_unlock(&mousedev->client_lock); } static void mousedev_detach_client(struct mousedev *mousedev, struct mousedev_client *client) { spin_lock(&mousedev->client_lock); list_del_rcu(&client->node); spin_unlock(&mousedev->client_lock); synchronize_rcu(); } static int mousedev_release(struct inode *inode, struct file *file) { struct mousedev_client *client = file->private_data; struct mousedev *mousedev = client->mousedev; mousedev_detach_client(mousedev, client); kfree(client); mousedev->close_device(mousedev); return 0; } static int mousedev_open(struct inode *inode, struct file *file) { struct mousedev_client *client; struct mousedev *mousedev; int error; #ifdef CONFIG_INPUT_MOUSEDEV_PSAUX if (imajor(inode) == MISC_MAJOR) mousedev = mousedev_mix; else #endif mousedev = container_of(inode->i_cdev, struct mousedev, cdev); client = kzalloc(sizeof(struct mousedev_client), GFP_KERNEL); if (!client) return -ENOMEM; spin_lock_init(&client->packet_lock); client->pos_x = xres / 2; client->pos_y = yres / 2; client->mousedev = mousedev; mousedev_attach_client(mousedev, client); error = mousedev->open_device(mousedev); if (error) goto err_free_client; file->private_data = client; stream_open(inode, file); return 0; err_free_client: mousedev_detach_client(mousedev, client); kfree(client); return error; } static void mousedev_packet(struct mousedev_client *client, u8 *ps2_data) { struct mousedev_motion *p = &client->packets[client->tail]; s8 dx, dy, dz; dx = clamp_val(p->dx, -127, 127); p->dx -= dx; dy = clamp_val(p->dy, -127, 127); p->dy -= dy; ps2_data[0] = BIT(3); ps2_data[0] |= ((dx & BIT(7)) >> 3) | ((dy & BIT(7)) >> 2); ps2_data[0] |= p->buttons & 0x07; ps2_data[1] = dx; ps2_data[2] = dy; switch (client->mode) { case MOUSEDEV_EMUL_EXPS: dz = clamp_val(p->dz, -7, 7); p->dz -= dz; ps2_data[3] = (dz & 0x0f) | ((p->buttons & 0x18) << 1); client->bufsiz = 4; break; case MOUSEDEV_EMUL_IMPS: dz = clamp_val(p->dz, -127, 127); p->dz -= dz; ps2_data[0] |= ((p->buttons & 0x10) >> 3) | ((p->buttons & 0x08) >> 1); ps2_data[3] = dz; client->bufsiz = 4; break; case MOUSEDEV_EMUL_PS2: default: p->dz = 0; ps2_data[0] |= ((p->buttons & 0x10) >> 3) | ((p->buttons & 0x08) >> 1); client->bufsiz = 3; break; } if (!p->dx && !p->dy && !p->dz) { if (client->tail == client->head) { client->ready = 0; client->last_buttons = p->buttons; } else client->tail = (client->tail + 1) % PACKET_QUEUE_LEN; } } static void mousedev_generate_response(struct mousedev_client *client, int command) { client->ps2[0] = 0xfa; /* ACK */ switch (command) { case 0xeb: /* Poll */ mousedev_packet(client, &client->ps2[1]); client->bufsiz++; /* account for leading ACK */ break; case 0xf2: /* Get ID */ switch (client->mode) { case MOUSEDEV_EMUL_PS2: client->ps2[1] = 0; break; case MOUSEDEV_EMUL_IMPS: client->ps2[1] = 3; break; case MOUSEDEV_EMUL_EXPS: client->ps2[1] = 4; break; } client->bufsiz = 2; break; case 0xe9: /* Get info */ client->ps2[1] = 0x60; client->ps2[2] = 3; client->ps2[3] = 200; client->bufsiz = 4; break; case 0xff: /* Reset */ client->impsseq = client->imexseq = 0; client->mode = MOUSEDEV_EMUL_PS2; client->ps2[1] = 0xaa; client->ps2[2] = 0x00; client->bufsiz = 3; break; default: client->bufsiz = 1; break; } client->buffer = client->bufsiz; } static ssize_t mousedev_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct mousedev_client *client = file->private_data; unsigned char c; unsigned int i; for (i = 0; i < count; i++) { if (get_user(c, buffer + i)) return -EFAULT; spin_lock_irq(&client->packet_lock); if (c == mousedev_imex_seq[client->imexseq]) { if (++client->imexseq == MOUSEDEV_SEQ_LEN) { client->imexseq = 0; client->mode = MOUSEDEV_EMUL_EXPS; } } else client->imexseq = 0; if (c == mousedev_imps_seq[client->impsseq]) { if (++client->impsseq == MOUSEDEV_SEQ_LEN) { client->impsseq = 0; client->mode = MOUSEDEV_EMUL_IMPS; } } else client->impsseq = 0; mousedev_generate_response(client, c); spin_unlock_irq(&client->packet_lock); cond_resched(); } kill_fasync(&client->fasync, SIGIO, POLL_IN); wake_up_interruptible(&client->mousedev->wait); return count; } static ssize_t mousedev_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct mousedev_client *client = file->private_data; struct mousedev *mousedev = client->mousedev; u8 data[sizeof(client->ps2)]; int retval = 0; if (!client->ready && !client->buffer && mousedev->exist && (file->f_flags & O_NONBLOCK)) return -EAGAIN; retval = wait_event_interruptible(mousedev->wait, !mousedev->exist || client->ready || client->buffer); if (retval) return retval; if (!mousedev->exist) return -ENODEV; spin_lock_irq(&client->packet_lock); if (!client->buffer && client->ready) { mousedev_packet(client, client->ps2); client->buffer = client->bufsiz; } if (count > client->buffer) count = client->buffer; memcpy(data, client->ps2 + client->bufsiz - client->buffer, count); client->buffer -= count; spin_unlock_irq(&client->packet_lock); if (copy_to_user(buffer, data, count)) return -EFAULT; return count; } /* No kernel lock - fine */ static __poll_t mousedev_poll(struct file *file, poll_table *wait) { struct mousedev_client *client = file->private_data; struct mousedev *mousedev = client->mousedev; __poll_t mask; poll_wait(file, &mousedev->wait, wait); mask = mousedev->exist ? EPOLLOUT | EPOLLWRNORM : EPOLLHUP | EPOLLERR; if (client->ready || client->buffer) mask |= EPOLLIN | EPOLLRDNORM; return mask; } static const struct file_operations mousedev_fops = { .owner = THIS_MODULE, .read = mousedev_read, .write = mousedev_write, .poll = mousedev_poll, .open = mousedev_open, .release = mousedev_release, .fasync = mousedev_fasync, .llseek = noop_llseek, }; /* * Mark device non-existent. This disables writes, ioctls and * prevents new users from opening the device. Already posted * blocking reads will stay, however new ones will fail. */ static void mousedev_mark_dead(struct mousedev *mousedev) { mutex_lock(&mousedev->mutex); mousedev->exist = false; mutex_unlock(&mousedev->mutex); } /* * Wake up users waiting for IO so they can disconnect from * dead device. */ static void mousedev_hangup(struct mousedev *mousedev) { struct mousedev_client *client; spin_lock(&mousedev->client_lock); list_for_each_entry(client, &mousedev->client_list, node) kill_fasync(&client->fasync, SIGIO, POLL_HUP); spin_unlock(&mousedev->client_lock); wake_up_interruptible(&mousedev->wait); } static void mousedev_cleanup(struct mousedev *mousedev) { struct input_handle *handle = &mousedev->handle; mousedev_mark_dead(mousedev); mousedev_hangup(mousedev); /* mousedev is marked dead so no one else accesses mousedev->open */ if (mousedev->open) input_close_device(handle); } static int mousedev_reserve_minor(bool mixdev) { int minor; if (mixdev) { minor = input_get_new_minor(MOUSEDEV_MIX, 1, false); if (minor < 0) pr_err("failed to reserve mixdev minor: %d\n", minor); } else { minor = input_get_new_minor(MOUSEDEV_MINOR_BASE, MOUSEDEV_MINORS, true); if (minor < 0) pr_err("failed to reserve new minor: %d\n", minor); } return minor; } static struct mousedev *mousedev_create(struct input_dev *dev, struct input_handler *handler, bool mixdev) { struct mousedev *mousedev; int minor; int error; minor = mousedev_reserve_minor(mixdev); if (minor < 0) { error = minor; goto err_out; } mousedev = kzalloc(sizeof(struct mousedev), GFP_KERNEL); if (!mousedev) { error = -ENOMEM; goto err_free_minor; } INIT_LIST_HEAD(&mousedev->client_list); INIT_LIST_HEAD(&mousedev->mixdev_node); spin_lock_init(&mousedev->client_lock); mutex_init(&mousedev->mutex); lockdep_set_subclass(&mousedev->mutex, mixdev ? SINGLE_DEPTH_NESTING : 0); init_waitqueue_head(&mousedev->wait); if (mixdev) { dev_set_name(&mousedev->dev, "mice"); mousedev->open_device = mixdev_open_devices; mousedev->close_device = mixdev_close_devices; } else { int dev_no = minor; /* Normalize device number if it falls into legacy range */ if (dev_no < MOUSEDEV_MINOR_BASE + MOUSEDEV_MINORS) dev_no -= MOUSEDEV_MINOR_BASE; dev_set_name(&mousedev->dev, "mouse%d", dev_no); mousedev->open_device = mousedev_open_device; mousedev->close_device = mousedev_close_device; } mousedev->exist = true; mousedev->handle.dev = input_get_device(dev); mousedev->handle.name = dev_name(&mousedev->dev); mousedev->handle.handler = handler; mousedev->handle.private = mousedev; mousedev->dev.class = &input_class; if (dev) mousedev->dev.parent = &dev->dev; mousedev->dev.devt = MKDEV(INPUT_MAJOR, minor); mousedev->dev.release = mousedev_free; device_initialize(&mousedev->dev); if (!mixdev) { error = input_register_handle(&mousedev->handle); if (error) goto err_free_mousedev; } cdev_init(&mousedev->cdev, &mousedev_fops); error = cdev_device_add(&mousedev->cdev, &mousedev->dev); if (error) goto err_cleanup_mousedev; return mousedev; err_cleanup_mousedev: mousedev_cleanup(mousedev); if (!mixdev) input_unregister_handle(&mousedev->handle); err_free_mousedev: put_device(&mousedev->dev); err_free_minor: input_free_minor(minor); err_out: return ERR_PTR(error); } static void mousedev_destroy(struct mousedev *mousedev) { cdev_device_del(&mousedev->cdev, &mousedev->dev); mousedev_cleanup(mousedev); input_free_minor(MINOR(mousedev->dev.devt)); if (mousedev != mousedev_mix) input_unregister_handle(&mousedev->handle); put_device(&mousedev->dev); } static int mixdev_add_device(struct mousedev *mousedev) { int retval; retval = mutex_lock_interruptible(&mousedev_mix->mutex); if (retval) return retval; if (mousedev_mix->open) { retval = mousedev_open_device(mousedev); if (retval) goto out; mousedev->opened_by_mixdev = true; } get_device(&mousedev->dev); list_add_tail(&mousedev->mixdev_node, &mousedev_mix_list); out: mutex_unlock(&mousedev_mix->mutex); return retval; } static void mixdev_remove_device(struct mousedev *mousedev) { mutex_lock(&mousedev_mix->mutex); if (mousedev->opened_by_mixdev) { mousedev->opened_by_mixdev = false; mousedev_close_device(mousedev); } list_del_init(&mousedev->mixdev_node); mutex_unlock(&mousedev_mix->mutex); put_device(&mousedev->dev); } static int mousedev_connect(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id) { struct mousedev *mousedev; int error; mousedev = mousedev_create(dev, handler, false); if (IS_ERR(mousedev)) return PTR_ERR(mousedev); error = mixdev_add_device(mousedev); if (error) { mousedev_destroy(mousedev); return error; } return 0; } static void mousedev_disconnect(struct input_handle *handle) { struct mousedev *mousedev = handle->private; mixdev_remove_device(mousedev); mousedev_destroy(mousedev); } static const struct input_device_id mousedev_ids[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_RELBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_REL) }, .keybit = { [BIT_WORD(BTN_LEFT)] = BIT_MASK(BTN_LEFT) }, .relbit = { BIT_MASK(REL_X) | BIT_MASK(REL_Y) }, }, /* A mouse like device, at least one button, two relative axes */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_RELBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_REL) }, .relbit = { BIT_MASK(REL_WHEEL) }, }, /* A separate scrollwheel */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) }, .keybit = { [BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH) }, .absbit = { BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) }, }, /* A tablet like device, at least touch detection, two absolute axes */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) }, .keybit = { [BIT_WORD(BTN_TOOL_FINGER)] = BIT_MASK(BTN_TOOL_FINGER) }, .absbit = { BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) | BIT_MASK(ABS_PRESSURE) | BIT_MASK(ABS_TOOL_WIDTH) }, }, /* A touchpad */ { .flags = INPUT_DEVICE_ID_MATCH_EVBIT | INPUT_DEVICE_ID_MATCH_KEYBIT | INPUT_DEVICE_ID_MATCH_ABSBIT, .evbit = { BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) }, .keybit = { [BIT_WORD(BTN_LEFT)] = BIT_MASK(BTN_LEFT) }, .absbit = { BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) }, }, /* Mouse-like device with absolute X and Y but ordinary clicks, like hp ILO2 High Performance mouse */ { }, /* Terminating entry */ }; MODULE_DEVICE_TABLE(input, mousedev_ids); static struct input_handler mousedev_handler = { .event = mousedev_event, .connect = mousedev_connect, .disconnect = mousedev_disconnect, .legacy_minors = true, .minor = MOUSEDEV_MINOR_BASE, .name = "mousedev", .id_table = mousedev_ids, }; #ifdef CONFIG_INPUT_MOUSEDEV_PSAUX #include <linux/miscdevice.h> static struct miscdevice psaux_mouse = { .minor = PSMOUSE_MINOR, .name = "psaux", .fops = &mousedev_fops, }; static bool psaux_registered; static void __init mousedev_psaux_register(void) { int error; error = misc_register(&psaux_mouse); if (error) pr_warn("could not register psaux device, error: %d\n", error); else psaux_registered = true; } static void __exit mousedev_psaux_unregister(void) { if (psaux_registered) misc_deregister(&psaux_mouse); } #else static inline void mousedev_psaux_register(void) { } static inline void mousedev_psaux_unregister(void) { } #endif static int __init mousedev_init(void) { int error; mousedev_mix = mousedev_create(NULL, &mousedev_handler, true); if (IS_ERR(mousedev_mix)) return PTR_ERR(mousedev_mix); error = input_register_handler(&mousedev_handler); if (error) { mousedev_destroy(mousedev_mix); return error; } mousedev_psaux_register(); pr_info("PS/2 mouse device common for all mice\n"); return 0; } static void __exit mousedev_exit(void) { mousedev_psaux_unregister(); input_unregister_handler(&mousedev_handler); mousedev_destroy(mousedev_mix); } module_init(mousedev_init); module_exit(mousedev_exit); |
8 8 8 8 8 1 1 1 1 21 21 21 21 21 21 2 61 1 1 89 1 88 3 89 2 89 89 58 33 90 34 2 60 61 31 61 1 1 1 1 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 12 13 12 13 11 13 13 1 1 1 1 1 9 4 1 1 4 4 4 4 4 4 4 89 89 4 4 9 9 9 1 9 1 8 21 20 21 20 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2017 Intel Deutschland GmbH */ #include <linux/kernel.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/slab.h> #include "rate.h" #include "ieee80211_i.h" #include "debugfs.h" struct rate_control_alg { struct list_head list; const struct rate_control_ops *ops; }; static LIST_HEAD(rate_ctrl_algs); static DEFINE_MUTEX(rate_ctrl_mutex); static char *ieee80211_default_rc_algo = CONFIG_MAC80211_RC_DEFAULT; module_param(ieee80211_default_rc_algo, charp, 0644); MODULE_PARM_DESC(ieee80211_default_rc_algo, "Default rate control algorithm for mac80211 to use"); void rate_control_rate_init(struct sta_info *sta) { struct ieee80211_local *local = sta->sdata->local; struct rate_control_ref *ref = sta->rate_ctrl; struct ieee80211_sta *ista = &sta->sta; void *priv_sta = sta->rate_ctrl_priv; struct ieee80211_supported_band *sband; struct ieee80211_chanctx_conf *chanctx_conf; ieee80211_sta_set_rx_nss(sta); if (!ref) return; rcu_read_lock(); chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); return; } sband = local->hw.wiphy->bands[chanctx_conf->def.chan->band]; /* TODO: check for minstrel_s1g ? */ if (sband->band == NL80211_BAND_S1GHZ) { ieee80211_s1g_sta_rate_init(sta); rcu_read_unlock(); return; } spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->rate_init(ref->priv, sband, &chanctx_conf->def, ista, priv_sta); spin_unlock_bh(&sta->rate_ctrl_lock); rcu_read_unlock(); set_sta_flag(sta, WLAN_STA_RATE_CONTROL); } void rate_control_tx_status(struct ieee80211_local *local, struct ieee80211_supported_band *sband, struct ieee80211_tx_status *st) { struct rate_control_ref *ref = local->rate_ctrl; struct sta_info *sta = container_of(st->sta, struct sta_info, sta); void *priv_sta = sta->rate_ctrl_priv; if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) return; spin_lock_bh(&sta->rate_ctrl_lock); if (ref->ops->tx_status_ext) ref->ops->tx_status_ext(ref->priv, sband, priv_sta, st); else if (st->skb) ref->ops->tx_status(ref->priv, sband, st->sta, priv_sta, st->skb); else WARN_ON_ONCE(1); spin_unlock_bh(&sta->rate_ctrl_lock); } void rate_control_rate_update(struct ieee80211_local *local, struct ieee80211_supported_band *sband, struct sta_info *sta, u32 changed) { struct rate_control_ref *ref = local->rate_ctrl; struct ieee80211_sta *ista = &sta->sta; void *priv_sta = sta->rate_ctrl_priv; struct ieee80211_chanctx_conf *chanctx_conf; if (ref && ref->ops->rate_update) { rcu_read_lock(); chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); return; } spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->rate_update(ref->priv, sband, &chanctx_conf->def, ista, priv_sta, changed); spin_unlock_bh(&sta->rate_ctrl_lock); rcu_read_unlock(); } drv_sta_rc_update(local, sta->sdata, &sta->sta, changed); } int ieee80211_rate_control_register(const struct rate_control_ops *ops) { struct rate_control_alg *alg; if (!ops->name) return -EINVAL; mutex_lock(&rate_ctrl_mutex); list_for_each_entry(alg, &rate_ctrl_algs, list) { if (!strcmp(alg->ops->name, ops->name)) { /* don't register an algorithm twice */ WARN_ON(1); mutex_unlock(&rate_ctrl_mutex); return -EALREADY; } } alg = kzalloc(sizeof(*alg), GFP_KERNEL); if (alg == NULL) { mutex_unlock(&rate_ctrl_mutex); return -ENOMEM; } alg->ops = ops; list_add_tail(&alg->list, &rate_ctrl_algs); mutex_unlock(&rate_ctrl_mutex); return 0; } EXPORT_SYMBOL(ieee80211_rate_control_register); void ieee80211_rate_control_unregister(const struct rate_control_ops *ops) { struct rate_control_alg *alg; mutex_lock(&rate_ctrl_mutex); list_for_each_entry(alg, &rate_ctrl_algs, list) { if (alg->ops == ops) { list_del(&alg->list); kfree(alg); break; } } mutex_unlock(&rate_ctrl_mutex); } EXPORT_SYMBOL(ieee80211_rate_control_unregister); static const struct rate_control_ops * ieee80211_try_rate_control_ops_get(const char *name) { struct rate_control_alg *alg; const struct rate_control_ops *ops = NULL; if (!name) return NULL; mutex_lock(&rate_ctrl_mutex); list_for_each_entry(alg, &rate_ctrl_algs, list) { if (!strcmp(alg->ops->name, name)) { ops = alg->ops; break; } } mutex_unlock(&rate_ctrl_mutex); return ops; } /* Get the rate control algorithm. */ static const struct rate_control_ops * ieee80211_rate_control_ops_get(const char *name) { const struct rate_control_ops *ops; const char *alg_name; kernel_param_lock(THIS_MODULE); if (!name) alg_name = ieee80211_default_rc_algo; else alg_name = name; ops = ieee80211_try_rate_control_ops_get(alg_name); if (!ops && name) /* try default if specific alg requested but not found */ ops = ieee80211_try_rate_control_ops_get(ieee80211_default_rc_algo); /* Note: check for > 0 is intentional to avoid clang warning */ if (!ops && (strlen(CONFIG_MAC80211_RC_DEFAULT) > 0)) /* try built-in one if specific alg requested but not found */ ops = ieee80211_try_rate_control_ops_get(CONFIG_MAC80211_RC_DEFAULT); kernel_param_unlock(THIS_MODULE); return ops; } #ifdef CONFIG_MAC80211_DEBUGFS static ssize_t rcname_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct rate_control_ref *ref = file->private_data; int len = strlen(ref->ops->name); return simple_read_from_buffer(userbuf, count, ppos, ref->ops->name, len); } const struct file_operations rcname_ops = { .read = rcname_read, .open = simple_open, .llseek = default_llseek, }; #endif static struct rate_control_ref * rate_control_alloc(const char *name, struct ieee80211_local *local) { struct rate_control_ref *ref; ref = kmalloc(sizeof(struct rate_control_ref), GFP_KERNEL); if (!ref) return NULL; ref->ops = ieee80211_rate_control_ops_get(name); if (!ref->ops) goto free; ref->priv = ref->ops->alloc(&local->hw); if (!ref->priv) goto free; return ref; free: kfree(ref); return NULL; } static void rate_control_free(struct ieee80211_local *local, struct rate_control_ref *ctrl_ref) { ctrl_ref->ops->free(ctrl_ref->priv); #ifdef CONFIG_MAC80211_DEBUGFS debugfs_remove_recursive(local->debugfs.rcdir); local->debugfs.rcdir = NULL; #endif kfree(ctrl_ref); } void ieee80211_check_rate_mask(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; u32 user_mask, basic_rates = sdata->vif.bss_conf.basic_rates; enum nl80211_band band; if (WARN_ON(!sdata->vif.bss_conf.chandef.chan)) return; band = sdata->vif.bss_conf.chandef.chan->band; if (band == NL80211_BAND_S1GHZ) { /* TODO */ return; } if (WARN_ON_ONCE(!basic_rates)) return; user_mask = sdata->rc_rateidx_mask[band]; sband = local->hw.wiphy->bands[band]; if (user_mask & basic_rates) return; sdata_dbg(sdata, "no overlap between basic rates (0x%x) and user mask (0x%x on band %d) - clearing the latter", basic_rates, user_mask, band); sdata->rc_rateidx_mask[band] = (1 << sband->n_bitrates) - 1; } static bool rc_no_data_or_no_ack_use_min(struct ieee80211_tx_rate_control *txrc) { struct sk_buff *skb = txrc->skb; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); return (info->flags & (IEEE80211_TX_CTL_NO_ACK | IEEE80211_TX_CTL_USE_MINRATE)) || !ieee80211_is_tx_data(skb); } static void rc_send_low_basicrate(struct ieee80211_tx_rate *rate, u32 basic_rates, struct ieee80211_supported_band *sband) { u8 i; if (sband->band == NL80211_BAND_S1GHZ) { /* TODO */ rate->flags |= IEEE80211_TX_RC_S1G_MCS; rate->idx = 0; return; } if (basic_rates == 0) return; /* assume basic rates unknown and accept rate */ if (rate->idx < 0) return; if (basic_rates & (1 << rate->idx)) return; /* selected rate is a basic rate */ for (i = rate->idx + 1; i <= sband->n_bitrates; i++) { if (basic_rates & (1 << i)) { rate->idx = i; return; } } /* could not find a basic rate; use original selection */ } static void __rate_control_send_low(struct ieee80211_hw *hw, struct ieee80211_supported_band *sband, struct ieee80211_sta *sta, struct ieee80211_tx_info *info, u32 rate_mask) { int i; u32 rate_flags = ieee80211_chandef_rate_flags(&hw->conf.chandef); if (sband->band == NL80211_BAND_S1GHZ) { info->control.rates[0].flags |= IEEE80211_TX_RC_S1G_MCS; info->control.rates[0].idx = 0; return; } if ((sband->band == NL80211_BAND_2GHZ) && (info->flags & IEEE80211_TX_CTL_NO_CCK_RATE)) rate_flags |= IEEE80211_RATE_ERP_G; info->control.rates[0].idx = 0; for (i = 0; i < sband->n_bitrates; i++) { if (!(rate_mask & BIT(i))) continue; if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; if (!rate_supported(sta, sband->band, i)) continue; info->control.rates[0].idx = i; break; } WARN_ONCE(i == sband->n_bitrates, "no supported rates for sta %pM (0x%x, band %d) in rate_mask 0x%x with flags 0x%x\n", sta ? sta->addr : NULL, sta ? sta->supp_rates[sband->band] : -1, sband->band, rate_mask, rate_flags); info->control.rates[0].count = (info->flags & IEEE80211_TX_CTL_NO_ACK) ? 1 : hw->max_rate_tries; info->control.skip_table = 1; } static bool rate_control_send_low(struct ieee80211_sta *pubsta, struct ieee80211_tx_rate_control *txrc) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); struct ieee80211_supported_band *sband = txrc->sband; struct sta_info *sta; int mcast_rate; bool use_basicrate = false; if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) { __rate_control_send_low(txrc->hw, sband, pubsta, info, txrc->rate_idx_mask); if (!pubsta && txrc->bss) { mcast_rate = txrc->bss_conf->mcast_rate[sband->band]; if (mcast_rate > 0) { info->control.rates[0].idx = mcast_rate - 1; return true; } use_basicrate = true; } else if (pubsta) { sta = container_of(pubsta, struct sta_info, sta); if (ieee80211_vif_is_mesh(&sta->sdata->vif)) use_basicrate = true; } if (use_basicrate) rc_send_low_basicrate(&info->control.rates[0], txrc->bss_conf->basic_rates, sband); return true; } return false; } static bool rate_idx_match_legacy_mask(s8 *rate_idx, int n_bitrates, u32 mask) { int j; /* See whether the selected rate or anything below it is allowed. */ for (j = *rate_idx; j >= 0; j--) { if (mask & (1 << j)) { /* Okay, found a suitable rate. Use it. */ *rate_idx = j; return true; } } /* Try to find a higher rate that would be allowed */ for (j = *rate_idx + 1; j < n_bitrates; j++) { if (mask & (1 << j)) { /* Okay, found a suitable rate. Use it. */ *rate_idx = j; return true; } } return false; } static bool rate_idx_match_mcs_mask(s8 *rate_idx, u8 *mcs_mask) { int i, j; int ridx, rbit; ridx = *rate_idx / 8; rbit = *rate_idx % 8; /* sanity check */ if (ridx < 0 || ridx >= IEEE80211_HT_MCS_MASK_LEN) return false; /* See whether the selected rate or anything below it is allowed. */ for (i = ridx; i >= 0; i--) { for (j = rbit; j >= 0; j--) if (mcs_mask[i] & BIT(j)) { *rate_idx = i * 8 + j; return true; } rbit = 7; } /* Try to find a higher rate that would be allowed */ ridx = (*rate_idx + 1) / 8; rbit = (*rate_idx + 1) % 8; for (i = ridx; i < IEEE80211_HT_MCS_MASK_LEN; i++) { for (j = rbit; j < 8; j++) if (mcs_mask[i] & BIT(j)) { *rate_idx = i * 8 + j; return true; } rbit = 0; } return false; } static bool rate_idx_match_vht_mcs_mask(s8 *rate_idx, u16 *vht_mask) { int i, j; int ridx, rbit; ridx = *rate_idx >> 4; rbit = *rate_idx & 0xf; if (ridx < 0 || ridx >= NL80211_VHT_NSS_MAX) return false; /* See whether the selected rate or anything below it is allowed. */ for (i = ridx; i >= 0; i--) { for (j = rbit; j >= 0; j--) { if (vht_mask[i] & BIT(j)) { *rate_idx = (i << 4) | j; return true; } } rbit = 15; } /* Try to find a higher rate that would be allowed */ ridx = (*rate_idx + 1) >> 4; rbit = (*rate_idx + 1) & 0xf; for (i = ridx; i < NL80211_VHT_NSS_MAX; i++) { for (j = rbit; j < 16; j++) { if (vht_mask[i] & BIT(j)) { *rate_idx = (i << 4) | j; return true; } } rbit = 0; } return false; } static void rate_idx_match_mask(s8 *rate_idx, u16 *rate_flags, struct ieee80211_supported_band *sband, enum nl80211_chan_width chan_width, u32 mask, u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN], u16 vht_mask[NL80211_VHT_NSS_MAX]) { if (*rate_flags & IEEE80211_TX_RC_VHT_MCS) { /* handle VHT rates */ if (rate_idx_match_vht_mcs_mask(rate_idx, vht_mask)) return; *rate_idx = 0; /* keep protection flags */ *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS | IEEE80211_TX_RC_USE_CTS_PROTECT | IEEE80211_TX_RC_USE_SHORT_PREAMBLE); *rate_flags |= IEEE80211_TX_RC_MCS; if (chan_width == NL80211_CHAN_WIDTH_40) *rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; if (rate_idx_match_mcs_mask(rate_idx, mcs_mask)) return; /* also try the legacy rates. */ *rate_flags &= ~(IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_40_MHZ_WIDTH); if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates, mask)) return; } else if (*rate_flags & IEEE80211_TX_RC_MCS) { /* handle HT rates */ if (rate_idx_match_mcs_mask(rate_idx, mcs_mask)) return; /* also try the legacy rates. */ *rate_idx = 0; /* keep protection flags */ *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS | IEEE80211_TX_RC_USE_CTS_PROTECT | IEEE80211_TX_RC_USE_SHORT_PREAMBLE); if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates, mask)) return; } else { /* handle legacy rates */ if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates, mask)) return; /* if HT BSS, and we handle a data frame, also try HT rates */ switch (chan_width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: return; default: break; } *rate_idx = 0; /* keep protection flags */ *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS | IEEE80211_TX_RC_USE_CTS_PROTECT | IEEE80211_TX_RC_USE_SHORT_PREAMBLE); *rate_flags |= IEEE80211_TX_RC_MCS; if (chan_width == NL80211_CHAN_WIDTH_40) *rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; if (rate_idx_match_mcs_mask(rate_idx, mcs_mask)) return; } /* * Uh.. No suitable rate exists. This should not really happen with * sane TX rate mask configurations. However, should someone manage to * configure supported rates and TX rate mask in incompatible way, * allow the frame to be transmitted with whatever the rate control * selected. */ } static void rate_fixup_ratelist(struct ieee80211_vif *vif, struct ieee80211_supported_band *sband, struct ieee80211_tx_info *info, struct ieee80211_tx_rate *rates, int max_rates) { struct ieee80211_rate *rate; bool inval = false; int i; /* * Set up the RTS/CTS rate as the fastest basic rate * that is not faster than the data rate unless there * is no basic rate slower than the data rate, in which * case we pick the slowest basic rate * * XXX: Should this check all retry rates? */ if (!(rates[0].flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))) { u32 basic_rates = vif->bss_conf.basic_rates; s8 baserate = basic_rates ? ffs(basic_rates) - 1 : 0; rate = &sband->bitrates[rates[0].idx]; for (i = 0; i < sband->n_bitrates; i++) { /* must be a basic rate */ if (!(basic_rates & BIT(i))) continue; /* must not be faster than the data rate */ if (sband->bitrates[i].bitrate > rate->bitrate) continue; /* maximum */ if (sband->bitrates[baserate].bitrate < sband->bitrates[i].bitrate) baserate = i; } info->control.rts_cts_rate_idx = baserate; } for (i = 0; i < max_rates; i++) { /* * make sure there's no valid rate following * an invalid one, just in case drivers don't * take the API seriously to stop at -1. */ if (inval) { rates[i].idx = -1; continue; } if (rates[i].idx < 0) { inval = true; continue; } /* * For now assume MCS is already set up correctly, this * needs to be fixed. */ if (rates[i].flags & IEEE80211_TX_RC_MCS) { WARN_ON(rates[i].idx > 76); if (!(rates[i].flags & IEEE80211_TX_RC_USE_RTS_CTS) && info->control.use_cts_prot) rates[i].flags |= IEEE80211_TX_RC_USE_CTS_PROTECT; continue; } if (rates[i].flags & IEEE80211_TX_RC_VHT_MCS) { WARN_ON(ieee80211_rate_get_vht_mcs(&rates[i]) > 9); continue; } /* set up RTS protection if desired */ if (info->control.use_rts) { rates[i].flags |= IEEE80211_TX_RC_USE_RTS_CTS; info->control.use_cts_prot = false; } /* RC is busted */ if (WARN_ON_ONCE(rates[i].idx >= sband->n_bitrates)) { rates[i].idx = -1; continue; } rate = &sband->bitrates[rates[i].idx]; /* set up short preamble */ if (info->control.short_preamble && rate->flags & IEEE80211_RATE_SHORT_PREAMBLE) rates[i].flags |= IEEE80211_TX_RC_USE_SHORT_PREAMBLE; /* set up G protection */ if (!(rates[i].flags & IEEE80211_TX_RC_USE_RTS_CTS) && info->control.use_cts_prot && rate->flags & IEEE80211_RATE_ERP_G) rates[i].flags |= IEEE80211_TX_RC_USE_CTS_PROTECT; } } static void rate_control_fill_sta_table(struct ieee80211_sta *sta, struct ieee80211_tx_info *info, struct ieee80211_tx_rate *rates, int max_rates) { struct ieee80211_sta_rates *ratetbl = NULL; int i; if (sta && !info->control.skip_table) ratetbl = rcu_dereference(sta->rates); /* Fill remaining rate slots with data from the sta rate table. */ max_rates = min_t(int, max_rates, IEEE80211_TX_RATE_TABLE_SIZE); for (i = 0; i < max_rates; i++) { if (i < ARRAY_SIZE(info->control.rates) && info->control.rates[i].idx >= 0 && info->control.rates[i].count) { if (rates != info->control.rates) rates[i] = info->control.rates[i]; } else if (ratetbl) { rates[i].idx = ratetbl->rate[i].idx; rates[i].flags = ratetbl->rate[i].flags; if (info->control.use_rts) rates[i].count = ratetbl->rate[i].count_rts; else if (info->control.use_cts_prot) rates[i].count = ratetbl->rate[i].count_cts; else rates[i].count = ratetbl->rate[i].count; } else { rates[i].idx = -1; rates[i].count = 0; } if (rates[i].idx < 0 || !rates[i].count) break; } } static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct ieee80211_sta *sta, u32 *mask, u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN], u16 vht_mask[NL80211_VHT_NSS_MAX]) { u32 i, flags; *mask = sdata->rc_rateidx_mask[sband->band]; flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef); for (i = 0; i < sband->n_bitrates; i++) { if ((flags & sband->bitrates[i].flags) != flags) *mask &= ~BIT(i); } if (*mask == (1 << sband->n_bitrates) - 1 && !sdata->rc_has_mcs_mask[sband->band] && !sdata->rc_has_vht_mcs_mask[sband->band]) return false; if (sdata->rc_has_mcs_mask[sband->band]) memcpy(mcs_mask, sdata->rc_rateidx_mcs_mask[sband->band], IEEE80211_HT_MCS_MASK_LEN); else memset(mcs_mask, 0xff, IEEE80211_HT_MCS_MASK_LEN); if (sdata->rc_has_vht_mcs_mask[sband->band]) memcpy(vht_mask, sdata->rc_rateidx_vht_mcs_mask[sband->band], sizeof(u16) * NL80211_VHT_NSS_MAX); else memset(vht_mask, 0xff, sizeof(u16) * NL80211_VHT_NSS_MAX); if (sta) { __le16 sta_vht_cap; u16 sta_vht_mask[NL80211_VHT_NSS_MAX]; /* Filter out rates that the STA does not support */ *mask &= sta->supp_rates[sband->band]; for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i]; sta_vht_cap = sta->vht_cap.vht_mcs.rx_mcs_map; ieee80211_get_vht_mask_from_cap(sta_vht_cap, sta_vht_mask); for (i = 0; i < NL80211_VHT_NSS_MAX; i++) vht_mask[i] &= sta_vht_mask[i]; } return true; } static void rate_control_apply_mask_ratetbl(struct sta_info *sta, struct ieee80211_supported_band *sband, struct ieee80211_sta_rates *rates) { int i; u32 mask; u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]; u16 vht_mask[NL80211_VHT_NSS_MAX]; enum nl80211_chan_width chan_width; if (!rate_control_cap_mask(sta->sdata, sband, &sta->sta, &mask, mcs_mask, vht_mask)) return; chan_width = sta->sdata->vif.bss_conf.chandef.width; for (i = 0; i < IEEE80211_TX_RATE_TABLE_SIZE; i++) { if (rates->rate[i].idx < 0) break; rate_idx_match_mask(&rates->rate[i].idx, &rates->rate[i].flags, sband, chan_width, mask, mcs_mask, vht_mask); } } static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_supported_band *sband, struct ieee80211_tx_rate *rates, int max_rates) { enum nl80211_chan_width chan_width; u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]; u32 mask; u16 rate_flags, vht_mask[NL80211_VHT_NSS_MAX]; int i; /* * Try to enforce the rateidx mask the user wanted. skip this if the * default mask (allow all rates) is used to save some processing for * the common case. */ if (!rate_control_cap_mask(sdata, sband, sta, &mask, mcs_mask, vht_mask)) return; /* * Make sure the rate index selected for each TX rate is * included in the configured mask and change the rate indexes * if needed. */ chan_width = sdata->vif.bss_conf.chandef.width; for (i = 0; i < max_rates; i++) { /* Skip invalid rates */ if (rates[i].idx < 0) break; rate_flags = rates[i].flags; rate_idx_match_mask(&rates[i].idx, &rate_flags, sband, chan_width, mask, mcs_mask, vht_mask); rates[i].flags = rate_flags; } } void ieee80211_get_tx_rates(struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct sk_buff *skb, struct ieee80211_tx_rate *dest, int max_rates) { struct ieee80211_sub_if_data *sdata; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_supported_band *sband; rate_control_fill_sta_table(sta, info, dest, max_rates); if (!vif) return; sdata = vif_to_sdata(vif); sband = sdata->local->hw.wiphy->bands[info->band]; if (ieee80211_is_tx_data(skb)) rate_control_apply_mask(sdata, sta, sband, dest, max_rates); if (dest[0].idx < 0) __rate_control_send_low(&sdata->local->hw, sband, sta, info, sdata->rc_rateidx_mask[info->band]); if (sta) rate_fixup_ratelist(vif, sband, info, dest, max_rates); } EXPORT_SYMBOL(ieee80211_get_tx_rates); void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_tx_rate_control *txrc) { struct rate_control_ref *ref = sdata->local->rate_ctrl; void *priv_sta = NULL; struct ieee80211_sta *ista = NULL; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); int i; for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { info->control.rates[i].idx = -1; info->control.rates[i].flags = 0; info->control.rates[i].count = 0; } if (rate_control_send_low(sta ? &sta->sta : NULL, txrc)) return; if (ieee80211_hw_check(&sdata->local->hw, HAS_RATE_CONTROL)) return; if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) { ista = &sta->sta; priv_sta = sta->rate_ctrl_priv; } if (ista) { spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->get_rate(ref->priv, ista, priv_sta, txrc); spin_unlock_bh(&sta->rate_ctrl_lock); } else { rate_control_send_low(NULL, txrc); } if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_RC_TABLE)) return; ieee80211_get_tx_rates(&sdata->vif, ista, txrc->skb, info->control.rates, ARRAY_SIZE(info->control.rates)); } int rate_control_set_rates(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, struct ieee80211_sta_rates *rates) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_sta_rates *old; struct ieee80211_supported_band *sband; sband = ieee80211_get_sband(sta->sdata); if (!sband) return -EINVAL; rate_control_apply_mask_ratetbl(sta, sband, rates); /* * mac80211 guarantees that this function will not be called * concurrently, so the following RCU access is safe, even without * extra locking. This can not be checked easily, so we just set * the condition to true. */ old = rcu_dereference_protected(pubsta->rates, true); rcu_assign_pointer(pubsta->rates, rates); if (old) kfree_rcu(old, rcu_head); if (sta->uploaded) drv_sta_rate_tbl_update(hw_to_local(hw), sta->sdata, pubsta); ieee80211_sta_set_expected_throughput(pubsta, sta_get_expected_throughput(sta)); return 0; } EXPORT_SYMBOL(rate_control_set_rates); int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, const char *name) { struct rate_control_ref *ref; ASSERT_RTNL(); if (local->open_count) return -EBUSY; if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { if (WARN_ON(!local->ops->set_rts_threshold)) return -EINVAL; return 0; } ref = rate_control_alloc(name, local); if (!ref) { wiphy_warn(local->hw.wiphy, "Failed to select rate control algorithm\n"); return -ENOENT; } WARN_ON(local->rate_ctrl); local->rate_ctrl = ref; wiphy_debug(local->hw.wiphy, "Selected rate control algorithm '%s'\n", ref->ops->name); return 0; } void rate_control_deinitialize(struct ieee80211_local *local) { struct rate_control_ref *ref; ref = local->rate_ctrl; if (!ref) return; local->rate_ctrl = NULL; rate_control_free(local, ref); } |
17 14 17 21 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 | /* * linux/fs/nls/nls_cp775.c * * Charset cp775 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0106, 0x00fc, 0x00e9, 0x0101, 0x00e4, 0x0123, 0x00e5, 0x0107, 0x0142, 0x0113, 0x0156, 0x0157, 0x012b, 0x0179, 0x00c4, 0x00c5, /* 0x90*/ 0x00c9, 0x00e6, 0x00c6, 0x014d, 0x00f6, 0x0122, 0x00a2, 0x015a, 0x015b, 0x00d6, 0x00dc, 0x00f8, 0x00a3, 0x00d8, 0x00d7, 0x00a4, /* 0xa0*/ 0x0100, 0x012a, 0x00f3, 0x017b, 0x017c, 0x017a, 0x201d, 0x00a6, 0x00a9, 0x00ae, 0x00ac, 0x00bd, 0x00bc, 0x0141, 0x00ab, 0x00bb, /* 0xb0*/ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x0104, 0x010c, 0x0118, 0x0116, 0x2563, 0x2551, 0x2557, 0x255d, 0x012e, 0x0160, 0x2510, /* 0xc0*/ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x0172, 0x016a, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x017d, /* 0xd0*/ 0x0105, 0x010d, 0x0119, 0x0117, 0x012f, 0x0161, 0x0173, 0x016b, 0x017e, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, /* 0xe0*/ 0x00d3, 0x00df, 0x014c, 0x0143, 0x00f5, 0x00d5, 0x00b5, 0x0144, 0x0136, 0x0137, 0x013b, 0x013c, 0x0146, 0x0112, 0x0145, 0x2019, /* 0xf0*/ 0x00ad, 0x00b1, 0x201c, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x201e, 0x00b0, 0x2219, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xff, 0x00, 0x96, 0x9c, 0x9f, 0x00, 0xa7, 0xf5, /* 0xa0-0xa7 */ 0x00, 0xa8, 0x00, 0xae, 0xaa, 0xf0, 0xa9, 0x00, /* 0xa8-0xaf */ 0xf8, 0xf1, 0xfd, 0xfc, 0x00, 0xe6, 0xf4, 0xfa, /* 0xb0-0xb7 */ 0x00, 0xfb, 0x00, 0xaf, 0xac, 0xab, 0xf3, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x00, /* 0xc0-0xc7 */ 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0xe0, 0x00, 0xe5, 0x99, 0x9e, /* 0xd0-0xd7 */ 0x9d, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x84, 0x86, 0x91, 0x00, /* 0xe0-0xe7 */ 0x00, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0xa2, 0x00, 0xe4, 0x94, 0xf6, /* 0xf0-0xf7 */ 0x9b, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0xa0, 0x83, 0x00, 0x00, 0xb5, 0xd0, 0x80, 0x87, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xb6, 0xd1, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0xed, 0x89, 0x00, 0x00, 0xb8, 0xd3, /* 0x10-0x17 */ 0xb7, 0xd2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x95, 0x85, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0xa1, 0x8c, 0x00, 0x00, 0xbd, 0xd4, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe8, 0xe9, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0xea, 0xeb, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0xad, 0x88, 0xe3, 0xe7, 0xee, 0xec, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0xe2, 0x93, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8a, 0x8b, /* 0x50-0x57 */ 0x00, 0x00, 0x97, 0x98, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xbe, 0xd5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0xc7, 0xd7, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0xc6, 0xd6, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x8d, 0xa5, 0xa3, 0xa4, 0xcf, 0xd8, 0x00, /* 0x78-0x7f */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0xef, 0x00, 0x00, 0xf2, 0xa6, 0xf7, 0x00, /* 0x18-0x1f */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0xf9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ }; static const unsigned char page25[256] = { 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, NULL, page22, NULL, NULL, page25, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8b, 0x8b, 0x8c, 0xa5, 0x84, 0x86, /* 0x88-0x8f */ 0x82, 0x91, 0x91, 0x93, 0x94, 0x85, 0x96, 0x98, /* 0x90-0x97 */ 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */ 0x83, 0x8c, 0xa2, 0xa4, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0x88, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xd0, 0xd1, 0xd2, /* 0xb0-0xb7 */ 0xd3, 0xb9, 0xba, 0xbb, 0xbc, 0xd4, 0xd5, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xd6, 0xd7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xd8, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xa2, 0xe1, 0x93, 0xe7, 0xe4, 0xe4, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe9, 0xe9, 0xeb, 0xeb, 0xec, 0x89, 0xec, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x9a, 0x90, 0xa0, 0x8e, 0x95, 0x8f, 0x80, /* 0x80-0x87 */ 0xad, 0xed, 0x8a, 0x8a, 0xa1, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x92, 0x92, 0xe2, 0x99, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x97, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xe0, 0xa3, 0xa3, 0x8d, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xb5, 0xb6, 0xb7, 0xb8, 0xbd, 0xbe, 0xc6, 0xc7, /* 0xd0-0xd7 */ 0xcf, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe5, 0xe5, 0x00, 0xe3, /* 0xe0-0xe7 */ 0xe8, 0xe8, 0xea, 0xea, 0xee, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "cp775", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp775(void) { return register_nls(&table); } static void __exit exit_nls_cp775(void) { unregister_nls(&table); } module_init(init_nls_cp775) module_exit(exit_nls_cp775) MODULE_LICENSE("Dual BSD/GPL"); |
13112 40 163 28 541 407 163 12490 488 12500 14860 1548 12490 12751 12754 186 186 14852 12791 12717 12715 14842 14838 14885 5124 8715 1 12893 12344 722 706 14313 13589 65 13432 13432 13434 1311 1519 39 12499 12504 935 12490 23 23 12534 81 136 7871 4 4 4 8175 12500 12494 12489 161 12501 20 508 285 739 1 19 304 54 11373 10 226 620 1493 1496 1476 25 4 1453 518 515 3 101 1448 3 1190 461 1494 1495 1496 51 1492 87 1015 250 521 45 40 45 45 45 45 5 45 45 45 45 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Resizable, Scalable, Concurrent Hash Table * * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au> * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch> * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> * * Code partially derived from nft_hash * Rewritten with rehash code from br_multicast plus single list * pointer as suggested by Josh Triplett * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H #include <linux/err.h> #include <linux/errno.h> #include <linux/jhash.h> #include <linux/list_nulls.h> #include <linux/workqueue.h> #include <linux/rculist.h> #include <linux/bit_spinlock.h> #include <linux/rhashtable-types.h> /* * Objects in an rhashtable have an embedded struct rhash_head * which is linked into as hash chain from the hash table - or one * of two or more hash tables when the rhashtable is being resized. * The end of the chain is marked with a special nulls marks which has * the least significant bit set but otherwise stores the address of * the hash bucket. This allows us to be sure we've found the end * of the right list. * The value stored in the hash bucket has BIT(0) used as a lock bit. * This bit must be atomically set before any changes are made to * the chain. To avoid dereferencing this pointer without clearing * the bit first, we use an opaque 'struct rhash_lock_head *' for the * pointer stored in the bucket. This struct needs to be defined so * that rcu_dereference() works on it, but it has no content so a * cast is needed for it to be useful. This ensures it isn't * used by mistake with clearing the lock bit first. */ struct rhash_lock_head {}; /* Maximum chain length before rehash * * The maximum (not average) chain length grows with the size of the hash * table, at a rate of (log N)/(log log N). * * The value of 16 is selected so that even if the hash table grew to * 2^32 you would not expect the maximum chain length to exceed it * unless we are under attack (or extremely unlucky). * * As this limit is only to detect attacks, we don't need to set it to a * lower value as you'd need the chain length to vastly exceed 16 to have * any real effect on the system. */ #define RHT_ELASTICITY 16u /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets * @nest: Number of bits of first-level nested table. * @rehash: Current bucket being rehashed * @hash_rnd: Random seed to fold into hash * @walkers: List of active walkers * @rcu: RCU structure for freeing the table * @future_tbl: Table under construction during rehashing * @ntbl: Nested table used when out of memory. * @buckets: size * hash buckets */ struct bucket_table { unsigned int size; unsigned int nest; u32 hash_rnd; struct list_head walkers; struct rcu_head rcu; struct bucket_table __rcu *future_tbl; struct lockdep_map dep_map; struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; /* * NULLS_MARKER() expects a hash value with the low * bits mostly likely to be significant, and it discards * the msb. * We give it an address, in which the bottom bit is * always 0, and the msb might be significant. * So we shift the address down one bit to align with * expectations and avoid losing a significant bit. * * We never store the NULLS_MARKER in the hash table * itself as we need the lsb for locking. * Instead we store a NULL */ #define RHT_NULLS_MARKER(ptr) \ ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1)) #define INIT_RHT_NULLS_HEAD(ptr) \ ((ptr) = NULL) static inline bool rht_is_a_nulls(const struct rhash_head *ptr) { return ((unsigned long) ptr & 1); } static inline void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) { return (char *)he - ht->p.head_offset; } static inline unsigned int rht_bucket_index(const struct bucket_table *tbl, unsigned int hash) { return hash & (tbl->size - 1); } static inline unsigned int rht_key_get_hash(struct rhashtable *ht, const void *key, const struct rhashtable_params params, unsigned int hash_rnd) { unsigned int hash; /* params must be equal to ht->p if it isn't constant. */ if (!__builtin_constant_p(params.key_len)) hash = ht->p.hashfn(key, ht->key_len, hash_rnd); else if (params.key_len) { unsigned int key_len = params.key_len; if (params.hashfn) hash = params.hashfn(key, key_len, hash_rnd); else if (key_len & (sizeof(u32) - 1)) hash = jhash(key, key_len, hash_rnd); else hash = jhash2(key, key_len / sizeof(u32), hash_rnd); } else { unsigned int key_len = ht->p.key_len; if (params.hashfn) hash = params.hashfn(key, key_len, hash_rnd); else hash = jhash(key, key_len, hash_rnd); } return hash; } static inline unsigned int rht_key_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const void *key, const struct rhashtable_params params) { unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd); return rht_bucket_index(tbl, hash); } static inline unsigned int rht_head_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he, const struct rhashtable_params params) { const char *ptr = rht_obj(ht, he); return likely(params.obj_hashfn) ? rht_bucket_index(tbl, params.obj_hashfn(ptr, params.key_len ?: ht->p.key_len, tbl->hash_rnd)) : rht_key_hashfn(ht, tbl, ptr + params.key_offset, params); } /** * rht_grow_above_75 - returns true if nelems > 0.75 * table-size * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_75(const struct rhashtable *ht, const struct bucket_table *tbl) { /* Expand table when exceeding 75% load */ return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) && (!ht->p.max_size || tbl->size < ht->p.max_size); } /** * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size * @ht: hash table * @tbl: current table */ static inline bool rht_shrink_below_30(const struct rhashtable *ht, const struct bucket_table *tbl) { /* Shrink table beneath 30% load */ return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) && tbl->size > ht->p.min_size; } /** * rht_grow_above_100 - returns true if nelems > table-size * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_100(const struct rhashtable *ht, const struct bucket_table *tbl) { return atomic_read(&ht->nelems) > tbl->size && (!ht->p.max_size || tbl->size < ht->p.max_size); } /** * rht_grow_above_max - returns true if table is above maximum * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_max(const struct rhashtable *ht, const struct bucket_table *tbl) { return atomic_read(&ht->nelems) >= ht->max_elems; } #ifdef CONFIG_PROVE_LOCKING int lockdep_rht_mutex_is_held(struct rhashtable *ht); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); #else static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht) { return 1; } static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { return 1; } #endif /* CONFIG_PROVE_LOCKING */ void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj); void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter); void rhashtable_walk_exit(struct rhashtable_iter *iter); int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU); static inline void rhashtable_walk_start(struct rhashtable_iter *iter) { (void)rhashtable_walk_start_check(iter); } void *rhashtable_walk_next(struct rhashtable_iter *iter); void *rhashtable_walk_peek(struct rhashtable_iter *iter); void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU); void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), void *arg); void rhashtable_destroy(struct rhashtable *ht); struct rhash_lock_head __rcu **rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash); struct rhash_lock_head __rcu **__rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash); struct rhash_lock_head __rcu **rht_bucket_nested_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash); #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_rcu(p, ht) \ rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_bucket(p, tbl, hash) \ rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_dereference_bucket_rcu(p, tbl, hash) \ rcu_dereference_check(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_entry(tpos, pos, member) \ ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) static inline struct rhash_lock_head __rcu *const *rht_bucket( const struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } static inline struct rhash_lock_head __rcu **rht_bucket_var( struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } static inline struct rhash_lock_head __rcu **rht_bucket_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) : &tbl->buckets[hash]; } /* * We lock a bucket by setting BIT(0) in the pointer - this is always * zero in real pointers. The NULLS mark is never stored in the bucket, * rather we store NULL if the bucket is empty. * bit_spin_locks do not handle contention well, but the whole point * of the hashtable design is to achieve minimum per-bucket contention. * A nested hash table might not have a bucket pointer. In that case * we cannot get a lock. For remove and replace the bucket cannot be * interesting and doesn't need locking. * For insert we allocate the bucket if this is the last bucket_table, * and then take the lock. * Sometimes we unlock a bucket by writing a new pointer there. In that * case we don't need to unlock, but we do need to reset state such as * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer() * provides the same release semantics that bit_spin_unlock() provides, * this is safe. * When we write to a bucket without unlocking, we use rht_assign_locked(). */ static inline void rht_lock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt) { local_bh_disable(); bit_spin_lock(0, (unsigned long *)bkt); lock_map_acquire(&tbl->dep_map); } static inline void rht_lock_nested(struct bucket_table *tbl, struct rhash_lock_head __rcu **bucket, unsigned int subclass) { local_bh_disable(); bit_spin_lock(0, (unsigned long *)bucket); lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); } static inline void rht_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt) { lock_map_release(&tbl->dep_map); bit_spin_unlock(0, (unsigned long *)bkt); local_bh_enable(); } static inline struct rhash_head *__rht_ptr( struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt) { return (struct rhash_head *) ((unsigned long)p & ~BIT(0) ?: (unsigned long)RHT_NULLS_MARKER(bkt)); } /* * Where 'bkt' is a bucket and might be locked: * rht_ptr_rcu() dereferences that pointer and clears the lock bit. * rht_ptr() dereferences in a context where the bucket is locked. * rht_ptr_exclusive() dereferences in a context where exclusive * access is guaranteed, such as when destroying the table. */ static inline struct rhash_head *rht_ptr_rcu( struct rhash_lock_head __rcu *const *bkt) { return __rht_ptr(rcu_dereference(*bkt), bkt); } static inline struct rhash_head *rht_ptr( struct rhash_lock_head __rcu *const *bkt, struct bucket_table *tbl, unsigned int hash) { return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); } static inline struct rhash_head *rht_ptr_exclusive( struct rhash_lock_head __rcu *const *bkt) { return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt); } static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, struct rhash_head *obj) { if (rht_is_a_nulls(obj)) obj = NULL; rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0))); } static inline void rht_assign_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, struct rhash_head *obj) { if (rht_is_a_nulls(obj)) obj = NULL; lock_map_release(&tbl->dep_map); rcu_assign_pointer(*bkt, (void *)obj); preempt_enable(); __release(bitlock); local_bh_enable(); } /** * rht_for_each_from - iterate over hash chain from given head * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index */ #define rht_for_each_from(pos, head, tbl, hash) \ for (pos = head; \ !rht_is_a_nulls(pos); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** * rht_for_each - iterate over hash chain * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index */ #define rht_for_each(pos, tbl, hash) \ rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ tbl, hash) /** * rht_for_each_entry_from - iterate over hash chain from given head * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \ for (pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** * rht_for_each_entry - iterate over hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry(tpos, pos, tbl, hash, member) \ rht_for_each_entry_from(tpos, pos, \ rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ tbl, hash, member) /** * rht_for_each_entry_safe - safely iterate over hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @next: the &struct rhash_head to use as next in loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive allows for the looped code to * remove the loop cursor from the list. */ #define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = next, \ next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL) /** * rht_for_each_rcu_from - iterate over rcu hash chain from given head * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu_from(pos, head, tbl, hash) \ for (({barrier(); }), \ pos = head; \ !rht_is_a_nulls(pos); \ pos = rcu_dereference_raw(pos->next)) /** * rht_for_each_rcu - iterate over rcu hash chain * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu(pos, tbl, hash) \ for (({barrier(); }), \ pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \ !rht_is_a_nulls(pos); \ pos = rcu_dereference_raw(pos->next)) /** * rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \ for (({barrier(); }), \ pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) /** * rht_for_each_entry_rcu - iterate over rcu hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ rht_for_each_entry_rcu_from(tpos, pos, \ rht_ptr_rcu(rht_bucket(tbl, hash)), \ tbl, hash, member) /** * rhl_for_each_rcu - iterate over rcu hash table list * @pos: the &struct rlist_head to use as a loop cursor. * @list: the head of the list * * This hash chain list-traversal primitive should be used on the * list returned by rhltable_lookup. */ #define rhl_for_each_rcu(pos, list) \ for (pos = list; pos; pos = rcu_dereference_raw(pos->next)) /** * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rlist_head to use as a loop cursor. * @list: the head of the list * @member: name of the &struct rlist_head within the hashable struct. * * This hash chain list-traversal primitive should be used on the * list returned by rhltable_lookup. */ #define rhl_for_each_entry_rcu(tpos, pos, list, member) \ for (pos = list; pos && rht_entry(tpos, pos, member); \ pos = rcu_dereference_raw(pos->next)) static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, const void *obj) { struct rhashtable *ht = arg->ht; const char *ptr = obj; return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len); } /* Internal function, do not use. */ static inline struct rhash_head *__rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_lock_head __rcu *const *bkt; struct bucket_table *tbl; struct rhash_head *he; unsigned int hash; tbl = rht_dereference_rcu(ht->tbl, ht); restart: hash = rht_key_hashfn(ht, tbl, key, params); bkt = rht_bucket(tbl, hash); do { rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) { if (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, he)) : rhashtable_compare(&arg, rht_obj(ht, he))) continue; return he; } /* An object might have been moved to a different hash chain, * while we walk along it - better check and retry. */ } while (he != RHT_NULLS_MARKER(bkt)); /* Ensure we see any new tables. */ smp_rmb(); tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (unlikely(tbl)) goto restart; return NULL; } /** * rhashtable_lookup - search hash table * @ht: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for a entry with an identical key. The first matching entry is returned. * * This must only be called under the RCU read lock. * * Returns the first entry on which the compare function returned true. */ static inline void *rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { struct rhash_head *he = __rhashtable_lookup(ht, key, params); return he ? rht_obj(ht, he) : NULL; } /** * rhashtable_lookup_fast - search hash table, without RCU read lock * @ht: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for a entry with an identical key. The first matching entry is returned. * * Only use this function when you have other mechanisms guaranteeing * that the object won't go away after the RCU read lock is released. * * Returns the first entry on which the compare function returned true. */ static inline void *rhashtable_lookup_fast( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { void *obj; rcu_read_lock(); obj = rhashtable_lookup(ht, key, params); rcu_read_unlock(); return obj; } /** * rhltable_lookup - search hash list table * @hlt: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for a entry with an identical key. All matching entries are returned * in a list. * * This must only be called under the RCU read lock. * * Returns the list of entries that match the given key. */ static inline struct rhlist_head *rhltable_lookup( struct rhltable *hlt, const void *key, const struct rhashtable_params params) { struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); return he ? container_of(he, struct rhlist_head, rhead) : NULL; } /* Internal function, please use rhashtable_insert_fast() instead. This * function returns the existing element already in hashes in there is a clash, * otherwise it returns an error via ERR_PTR(). */ static inline void *__rhashtable_insert_fast( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct bucket_table *tbl; struct rhash_head *head; unsigned int hash; int elasticity; void *data; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); hash = rht_head_hashfn(ht, tbl, obj, params); elasticity = RHT_ELASTICITY; bkt = rht_bucket_insert(ht, tbl, hash); data = ERR_PTR(-ENOMEM); if (!bkt) goto out; pprev = NULL; rht_lock(tbl, bkt); if (unlikely(rcu_access_pointer(tbl->future_tbl))) { slow_path: rht_unlock(tbl, bkt); rcu_read_unlock(); return rhashtable_insert_slow(ht, key, obj); } rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *plist; struct rhlist_head *list; elasticity--; if (!key || (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, head)) : rhashtable_compare(&arg, rht_obj(ht, head)))) { pprev = &head->next; continue; } data = rht_obj(ht, head); if (!rhlist) goto out_unlock; list = container_of(obj, struct rhlist_head, rhead); plist = container_of(head, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, plist); head = rht_dereference_bucket(head->next, tbl, hash); RCU_INIT_POINTER(list->rhead.next, head); if (pprev) { rcu_assign_pointer(*pprev, obj); rht_unlock(tbl, bkt); } else rht_assign_unlock(tbl, bkt, obj); data = NULL; goto out; } if (elasticity <= 0) goto slow_path; data = ERR_PTR(-E2BIG); if (unlikely(rht_grow_above_max(ht, tbl))) goto out_unlock; if (unlikely(rht_grow_above_100(ht, tbl))) goto slow_path; /* Inserting at head of list makes unlocking free. */ head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (rhlist) { struct rhlist_head *list; list = container_of(obj, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, NULL); } atomic_inc(&ht->nelems); rht_assign_unlock(tbl, bkt, obj); if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); data = NULL; out: rcu_read_unlock(); return data; out_unlock: rht_unlock(tbl, bkt); goto out; } /** * rhashtable_insert_fast - insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhashtable_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { void *ret; ret = __rhashtable_insert_fast(ht, NULL, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhltable_insert_key - insert object into hash list table * @hlt: hash list table * @key: the pointer to the key * @list: pointer to hash list head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhltable_insert_key( struct rhltable *hlt, const void *key, struct rhlist_head *list, const struct rhashtable_params params) { return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, params, true)); } /** * rhltable_insert - insert object into hash list table * @hlt: hash list table * @list: pointer to hash list head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhltable_insert( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { const char *key = rht_obj(&hlt->ht, &list->rhead); key += params.key_offset; return rhltable_insert_key(hlt, key, list, params); } /** * rhashtable_lookup_insert_fast - lookup and insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * This lookup function may only be used for fixed key hash table (key_len * parameter set). It will BUG() if used inappropriately. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhashtable_lookup_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); void *ret; BUG_ON(ht->p.obj_hashfn); ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Just like rhashtable_lookup_insert_fast(), but this function returns the * object if it exists, NULL if it did not and the insertion was successful, * and an ERR_PTR otherwise. */ static inline void *rhashtable_lookup_get_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); BUG_ON(ht->p.obj_hashfn); return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, false); } /** * rhashtable_lookup_insert_key - search and insert object to hash table * with explicit key * @ht: hash table * @key: key * @obj: pointer to hash head inside object * @params: hash table parameters * * Lookups may occur in parallel with hashtable mutations and resizing. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. * * Returns zero on success. */ static inline int rhashtable_lookup_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { void *ret; BUG_ON(!ht->p.obj_hashfn || !key); ret = __rhashtable_insert_fast(ht, key, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhashtable_lookup_get_insert_key - lookup and insert object into hash table * @ht: hash table * @key: key * @obj: pointer to hash head inside object * @params: hash table parameters * * Just like rhashtable_lookup_insert_key(), but this function returns the * object if it exists, NULL if it does not and the insertion was successful, * and an ERR_PTR otherwise. */ static inline void *rhashtable_lookup_get_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { BUG_ON(!ht->p.obj_hashfn || !key); return __rhashtable_insert_fast(ht, key, obj, params, false); } /* Internal function, please use rhashtable_remove_fast() instead */ static inline int __rhashtable_remove_fast_one( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; unsigned int hash; int err = -ENOENT; hash = rht_head_hashfn(ht, tbl, obj, params); bkt = rht_bucket_var(tbl, hash); if (!bkt) return -ENOENT; pprev = NULL; rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; list = container_of(he, struct rhlist_head, rhead); if (he != obj) { struct rhlist_head __rcu **lpprev; pprev = &he->next; if (!rhlist) continue; do { lpprev = &list->next; list = rht_dereference_bucket(list->next, tbl, hash); } while (list && obj != &list->rhead); if (!list) continue; list = rht_dereference_bucket(list->next, tbl, hash); RCU_INIT_POINTER(*lpprev, list); err = 0; break; } obj = rht_dereference_bucket(obj->next, tbl, hash); err = 1; if (rhlist) { list = rht_dereference_bucket(list->next, tbl, hash); if (list) { RCU_INIT_POINTER(list->rhead.next, obj); obj = &list->rhead; err = 0; } } if (pprev) { rcu_assign_pointer(*pprev, obj); rht_unlock(tbl, bkt); } else { rht_assign_unlock(tbl, bkt, obj); } goto unlocked; } rht_unlock(tbl, bkt); unlocked: if (err > 0) { atomic_dec(&ht->nelems); if (unlikely(ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))) schedule_work(&ht->run_work); err = 0; } return err; } /* Internal function, please use rhashtable_remove_fast() instead */ static inline int __rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct bucket_table *tbl; int err; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); /* Because we have already taken (and released) the bucket * lock in old_tbl, if we find that future_tbl is not yet * visible then that guarantees the entry to still be in * the old tbl if it exists. */ while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params, rhlist)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; rcu_read_unlock(); return err; } /** * rhashtable_remove_fast - remove object from hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Since the hash chain is single linked, the removal operation needs to * walk the bucket chain upon removal. The removal operation is thus * considerable slow if the hash table is not correctly sized. * * Will automatically shrink the table if permitted when residency drops * below 30%. * * Returns zero on success, -ENOENT if the entry could not be found. */ static inline int rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { return __rhashtable_remove_fast(ht, obj, params, false); } /** * rhltable_remove - remove object from hash list table * @hlt: hash list table * @list: pointer to hash list head inside object * @params: hash table parameters * * Since the hash chain is single linked, the removal operation needs to * walk the bucket chain upon removal. The removal operation is thus * considerable slow if the hash table is not correctly sized. * * Will automatically shrink the table if permitted when residency drops * below 30% * * Returns zero on success, -ENOENT if the entry could not be found. */ static inline int rhltable_remove( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true); } /* Internal function, please use rhashtable_replace_fast() instead */ static inline int __rhashtable_replace_fast( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) { struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; unsigned int hash; int err = -ENOENT; /* Minimally, the old and new objects must have same hash * (which should mean identifiers are the same). */ hash = rht_head_hashfn(ht, tbl, obj_old, params); if (hash != rht_head_hashfn(ht, tbl, obj_new, params)) return -EINVAL; bkt = rht_bucket_var(tbl, hash); if (!bkt) return -ENOENT; pprev = NULL; rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { if (he != obj_old) { pprev = &he->next; continue; } rcu_assign_pointer(obj_new->next, obj_old->next); if (pprev) { rcu_assign_pointer(*pprev, obj_new); rht_unlock(tbl, bkt); } else { rht_assign_unlock(tbl, bkt, obj_new); } err = 0; goto unlocked; } rht_unlock(tbl, bkt); unlocked: return err; } /** * rhashtable_replace_fast - replace an object in hash table * @ht: hash table * @obj_old: pointer to hash head inside object being replaced * @obj_new: pointer to hash head inside object which is new * @params: hash table parameters * * Replacing an object doesn't affect the number of elements in the hash table * or bucket, so we don't need to worry about shrinking or expanding the * table here. * * Returns zero on success, -ENOENT if the entry could not be found, * -EINVAL if hash is not the same for the old and new objects. */ static inline int rhashtable_replace_fast( struct rhashtable *ht, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) { struct bucket_table *tbl; int err; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); /* Because we have already taken (and released) the bucket * lock in old_tbl, if we find that future_tbl is not yet * visible then that guarantees the entry to still be in * the old tbl if it exists. */ while ((err = __rhashtable_replace_fast(ht, tbl, obj_old, obj_new, params)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; rcu_read_unlock(); return err; } /** * rhltable_walk_enter - Initialise an iterator * @hlt: Table to walk over * @iter: Hash table Iterator * * This function prepares a hash table walk. * * Note that if you restart a walk after rhashtable_walk_stop you * may see the same object twice. Also, you may miss objects if * there are removals in between rhashtable_walk_stop and the next * call to rhashtable_walk_start. * * For a completely stable walk you should construct your own data * structure outside the hash table. * * This function may be called from any process context, including * non-preemptable context, but cannot be called from softirq or * hardirq context. * * You must call rhashtable_walk_exit after this function returns. */ static inline void rhltable_walk_enter(struct rhltable *hlt, struct rhashtable_iter *iter) { return rhashtable_walk_enter(&hlt->ht, iter); } /** * rhltable_free_and_destroy - free elements and destroy hash list table * @hlt: the hash list table to destroy * @free_fn: callback to release resources of element * @arg: pointer passed to free_fn * * See documentation for rhashtable_free_and_destroy. */ static inline void rhltable_free_and_destroy(struct rhltable *hlt, void (*free_fn)(void *ptr, void *arg), void *arg) { return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); } static inline void rhltable_destroy(struct rhltable *hlt) { return rhltable_free_and_destroy(hlt, NULL, NULL); } #endif /* _LINUX_RHASHTABLE_H */ |
238 94 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 | /* SPDX-License-Identifier: GPL-2.0 */ /* * workqueue.h --- work queue handling for Linux. */ #ifndef _LINUX_WORKQUEUE_H #define _LINUX_WORKQUEUE_H #include <linux/timer.h> #include <linux/linkage.h> #include <linux/bitops.h> #include <linux/lockdep.h> #include <linux/threads.h> #include <linux/atomic.h> #include <linux/cpumask.h> #include <linux/rcupdate.h> struct workqueue_struct; struct work_struct; typedef void (*work_func_t)(struct work_struct *work); void delayed_work_timer_fn(struct timer_list *t); /* * The first word is the work queue pointer and the flags rolled into * one */ #define work_data_bits(work) ((unsigned long *)(&(work)->data)) enum { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ WORK_STRUCT_INACTIVE_BIT= 1, /* work item is inactive */ WORK_STRUCT_PWQ_BIT = 2, /* data points to pwq */ WORK_STRUCT_LINKED_BIT = 3, /* next work is linked to this one */ #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC_BIT = 4, /* static initializer (debugobjects) */ WORK_STRUCT_COLOR_SHIFT = 5, /* color for workqueue flushing */ #else WORK_STRUCT_COLOR_SHIFT = 4, /* color for workqueue flushing */ #endif WORK_STRUCT_COLOR_BITS = 4, WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, WORK_STRUCT_INACTIVE = 1 << WORK_STRUCT_INACTIVE_BIT, WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT, WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, #else WORK_STRUCT_STATIC = 0, #endif WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS), /* not bound to any CPU, prefer the local CPU */ WORK_CPU_UNBOUND = NR_CPUS, /* * Reserve 8 bits off of pwq pointer w/ debugobjects turned off. * This makes pwqs aligned to 256 bytes and allows 16 workqueue * flush colors. */ WORK_STRUCT_FLAG_BITS = WORK_STRUCT_COLOR_SHIFT + WORK_STRUCT_COLOR_BITS, /* data contains off-queue information when !WORK_STRUCT_PWQ */ WORK_OFFQ_FLAG_BASE = WORK_STRUCT_COLOR_SHIFT, __WORK_OFFQ_CANCELING = WORK_OFFQ_FLAG_BASE, /* * When a work item is off queue, its high bits point to the last * pool it was on. Cap at 31 bits and use the highest number to * indicate that no pool is associated. */ WORK_OFFQ_FLAG_BITS = 1, WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_FLAG_BASE + WORK_OFFQ_FLAG_BITS, WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31, /* bit mask for work_busy() return values */ WORK_BUSY_PENDING = 1 << 0, WORK_BUSY_RUNNING = 1 << 1, /* maximum string length for set_worker_desc() */ WORKER_DESC_LEN = 24, }; /* Convenience constants - of type 'unsigned long', not 'enum'! */ #define WORK_OFFQ_CANCELING (1ul << __WORK_OFFQ_CANCELING) #define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) #define WORK_STRUCT_NO_POOL (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT) #define WORK_STRUCT_FLAG_MASK ((1ul << WORK_STRUCT_FLAG_BITS) - 1) #define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK) struct work_struct { atomic_long_t data; struct list_head entry; work_func_t func; #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif }; #define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL) #define WORK_DATA_STATIC_INIT() \ ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC)) struct delayed_work { struct work_struct work; struct timer_list timer; /* target workqueue and CPU ->timer uses to queue ->work */ struct workqueue_struct *wq; int cpu; }; struct rcu_work { struct work_struct work; struct rcu_head rcu; /* target workqueue ->rcu uses to queue ->work */ struct workqueue_struct *wq; }; /** * struct workqueue_attrs - A struct for workqueue attributes. * * This can be used to change attributes of an unbound workqueue. */ struct workqueue_attrs { /** * @nice: nice level */ int nice; /** * @cpumask: allowed CPUs */ cpumask_var_t cpumask; /** * @no_numa: disable NUMA affinity * * Unlike other fields, ``no_numa`` isn't a property of a worker_pool. It * only modifies how :c:func:`apply_workqueue_attrs` select pools and thus * doesn't participate in pool hash calculations or equality comparisons. */ bool no_numa; }; static inline struct delayed_work *to_delayed_work(struct work_struct *work) { return container_of(work, struct delayed_work, work); } static inline struct rcu_work *to_rcu_work(struct work_struct *work) { return container_of(work, struct rcu_work, work); } struct execute_work { struct work_struct work; }; #ifdef CONFIG_LOCKDEP /* * NB: because we have to copy the lockdep_map, setting _key * here is required, otherwise it could get initialised to the * copy of the lockdep_map! */ #define __WORK_INIT_LOCKDEP_MAP(n, k) \ .lockdep_map = STATIC_LOCKDEP_MAP_INIT(n, k), #else #define __WORK_INIT_LOCKDEP_MAP(n, k) #endif #define __WORK_INITIALIZER(n, f) { \ .data = WORK_DATA_STATIC_INIT(), \ .entry = { &(n).entry, &(n).entry }, \ .func = (f), \ __WORK_INIT_LOCKDEP_MAP(#n, &(n)) \ } #define __DELAYED_WORK_INITIALIZER(n, f, tflags) { \ .work = __WORK_INITIALIZER((n).work, (f)), \ .timer = __TIMER_INITIALIZER(delayed_work_timer_fn,\ (tflags) | TIMER_IRQSAFE), \ } #define DECLARE_WORK(n, f) \ struct work_struct n = __WORK_INITIALIZER(n, f) #define DECLARE_DELAYED_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, 0) #define DECLARE_DEFERRABLE_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, TIMER_DEFERRABLE) #ifdef CONFIG_DEBUG_OBJECTS_WORK extern void __init_work(struct work_struct *work, int onstack); extern void destroy_work_on_stack(struct work_struct *work); extern void destroy_delayed_work_on_stack(struct delayed_work *work); static inline unsigned int work_static(struct work_struct *work) { return *work_data_bits(work) & WORK_STRUCT_STATIC; } #else static inline void __init_work(struct work_struct *work, int onstack) { } static inline void destroy_work_on_stack(struct work_struct *work) { } static inline void destroy_delayed_work_on_stack(struct delayed_work *work) { } static inline unsigned int work_static(struct work_struct *work) { return 0; } #endif /* * initialize all of a work item in one go * * NOTE! No point in using "atomic_long_set()": using a direct * assignment of the work data initializer allows the compiler * to generate better code. */ #ifdef CONFIG_LOCKDEP #define __INIT_WORK_KEY(_work, _func, _onstack, _key) \ do { \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, (_key), 0); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #else #define __INIT_WORK_KEY(_work, _func, _onstack, _key) \ do { \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #endif #define __INIT_WORK(_work, _func, _onstack) \ do { \ static __maybe_unused struct lock_class_key __key; \ \ __INIT_WORK_KEY(_work, _func, _onstack, &__key); \ } while (0) #define INIT_WORK(_work, _func) \ __INIT_WORK((_work), (_func), 0) #define INIT_WORK_ONSTACK(_work, _func) \ __INIT_WORK((_work), (_func), 1) #define INIT_WORK_ONSTACK_KEY(_work, _func, _key) \ __INIT_WORK_KEY((_work), (_func), 1, _key) #define __INIT_DELAYED_WORK(_work, _func, _tflags) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ __init_timer(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) #define __INIT_DELAYED_WORK_ONSTACK(_work, _func, _tflags) \ do { \ INIT_WORK_ONSTACK(&(_work)->work, (_func)); \ __init_timer_on_stack(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) #define INIT_DELAYED_WORK(_work, _func) \ __INIT_DELAYED_WORK(_work, _func, 0) #define INIT_DELAYED_WORK_ONSTACK(_work, _func) \ __INIT_DELAYED_WORK_ONSTACK(_work, _func, 0) #define INIT_DEFERRABLE_WORK(_work, _func) \ __INIT_DELAYED_WORK(_work, _func, TIMER_DEFERRABLE) #define INIT_DEFERRABLE_WORK_ONSTACK(_work, _func) \ __INIT_DELAYED_WORK_ONSTACK(_work, _func, TIMER_DEFERRABLE) #define INIT_RCU_WORK(_work, _func) \ INIT_WORK(&(_work)->work, (_func)) #define INIT_RCU_WORK_ONSTACK(_work, _func) \ INIT_WORK_ONSTACK(&(_work)->work, (_func)) /** * work_pending - Find out whether a work item is currently pending * @work: The work item in question */ #define work_pending(work) \ test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) /** * delayed_work_pending - Find out whether a delayable work item is currently * pending * @w: The work item in question */ #define delayed_work_pending(w) \ work_pending(&(w)->work) /* * Workqueue flags and constants. For details, please refer to * Documentation/core-api/workqueue.rst. */ enum { WQ_UNBOUND = 1 << 1, /* not bound to any cpu */ WQ_FREEZABLE = 1 << 2, /* freeze during suspend */ WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu intensive workqueue */ WQ_SYSFS = 1 << 6, /* visible in sysfs, see workqueue_sysfs_register() */ /* * Per-cpu workqueues are generally preferred because they tend to * show better performance thanks to cache locality. Per-cpu * workqueues exclude the scheduler from choosing the CPU to * execute the worker threads, which has an unfortunate side effect * of increasing power consumption. * * The scheduler considers a CPU idle if it doesn't have any task * to execute and tries to keep idle cores idle to conserve power; * however, for example, a per-cpu work item scheduled from an * interrupt handler on an idle CPU will force the scheduler to * execute the work item on that CPU breaking the idleness, which in * turn may lead to more scheduling choices which are sub-optimal * in terms of power consumption. * * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default * but become unbound if workqueue.power_efficient kernel param is * specified. Per-cpu workqueues which are identified to * contribute significantly to power-consumption are identified and * marked with this flag and enabling the power_efficient mode * leads to noticeable power saving at the cost of small * performance disadvantage. * * http://thread.gmane.org/gmane.linux.kernel/1480396 */ WQ_POWER_EFFICIENT = 1 << 7, __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, }; /* unbound wq's aren't per-cpu, scale max_active according to #cpus */ #define WQ_UNBOUND_MAX_ACTIVE \ max_t(int, WQ_MAX_ACTIVE, num_possible_cpus() * WQ_MAX_UNBOUND_PER_CPU) /* * System-wide workqueues which are always present. * * system_wq is the one used by schedule[_delayed]_work[_on](). * Multi-CPU multi-threaded. There are users which expect relatively * short queue flush time. Don't queue works which can run for too * long. * * system_highpri_wq is similar to system_wq but for work items which * require WQ_HIGHPRI. * * system_long_wq is similar to system_wq but may host long running * works. Queue flushing might take relatively long. * * system_unbound_wq is unbound workqueue. Workers are not bound to * any specific CPU, not concurrency managed, and all queued works are * executed immediately as long as max_active limit is not reached and * resources are available. * * system_freezable_wq is equivalent to system_wq except that it's * freezable. * * *_power_efficient_wq are inclined towards saving power and converted * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise, * they are same as their non-power-efficient counterparts - e.g. * system_power_efficient_wq is identical to system_wq if * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. */ extern struct workqueue_struct *system_wq; extern struct workqueue_struct *system_highpri_wq; extern struct workqueue_struct *system_long_wq; extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; /** * alloc_workqueue - allocate a workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags * @max_active: max in-flight work items, 0 for default * remaining args: args for @fmt * * Allocate a workqueue with the specified parameters. For detailed * information on WQ_* flags, please refer to * Documentation/core-api/workqueue.rst. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ __printf(1, 4) struct workqueue_struct * alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); /** * alloc_ordered_workqueue - allocate an ordered workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) * @args...: args for @fmt * * Allocate an ordered workqueue. An ordered workqueue executes at * most one work item at any given time in the queued order. They are * implemented as unbound workqueues with @max_active of one. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue(fmt, flags, args...) \ alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | \ __WQ_ORDERED_EXPLICIT | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) #define create_freezable_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \ WQ_MEM_RECLAIM, 1, (name)) #define create_singlethread_workqueue(name) \ alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name) extern void destroy_workqueue(struct workqueue_struct *wq); struct workqueue_attrs *alloc_workqueue_attrs(void); void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); int workqueue_set_unbound_cpumask(cpumask_var_t cpumask); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_work_node(int node, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay); extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork); extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); extern bool flush_work(struct work_struct *work); extern bool cancel_work(struct work_struct *work); extern bool cancel_work_sync(struct work_struct *work); extern bool flush_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern bool flush_rcu_work(struct rcu_work *rwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); extern struct work_struct *current_work(void); extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); extern __printf(1, 2) void set_worker_desc(const char *fmt, ...); extern void print_worker_info(const char *log_lvl, struct task_struct *task); extern void show_all_workqueues(void); extern void show_one_workqueue(struct workqueue_struct *wq); extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task); /** * queue_work - queue work on a workqueue * @wq: workqueue to use * @work: work to queue * * Returns %false if @work was already on a queue, %true otherwise. * * We queue the work to the CPU on which it was submitted, but if the CPU dies * it can be processed by another CPU. * * Memory-ordering properties: If it returns %true, guarantees that all stores * preceding the call to queue_work() in the program order will be visible from * the CPU which will execute @work by the time such work executes, e.g., * * { x is initially 0 } * * CPU0 CPU1 * * WRITE_ONCE(x, 1); [ @work is being executed ] * r0 = queue_work(wq, work); r1 = READ_ONCE(x); * * Forbids: r0 == true && r1 == 0 */ static inline bool queue_work(struct workqueue_struct *wq, struct work_struct *work) { return queue_work_on(WORK_CPU_UNBOUND, wq, work); } /** * queue_delayed_work - queue work on a workqueue after delay * @wq: workqueue to use * @dwork: delayable work to queue * @delay: number of jiffies to wait before queueing * * Equivalent to queue_delayed_work_on() but tries to use the local CPU. */ static inline bool queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } /** * mod_delayed_work - modify delay of or queue a delayed work * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * mod_delayed_work_on() on local CPU. */ static inline bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } /** * schedule_work_on - put work task on a specific cpu * @cpu: cpu to put the work task on * @work: job to be done * * This puts a job on a specific cpu */ static inline bool schedule_work_on(int cpu, struct work_struct *work) { return queue_work_on(cpu, system_wq, work); } /** * schedule_work - put work task in global workqueue * @work: job to be done * * Returns %false if @work was already on the kernel-global workqueue and * %true otherwise. * * This puts a job in the kernel-global workqueue if it was not already * queued and leaves it in the same position on the kernel-global * workqueue otherwise. * * Shares the same memory-ordering properties of queue_work(), cf. the * DocBook header of queue_work(). */ static inline bool schedule_work(struct work_struct *work) { return queue_work(system_wq, work); } /** * flush_scheduled_work - ensure that any scheduled work has run to completion. * * Forces execution of the kernel-global workqueue and blocks until its * completion. * * Think twice before calling this function! It's very easy to get into * trouble if you don't take great care. Either of the following situations * will lead to deadlock: * * One of the work items currently on the workqueue needs to acquire * a lock held by your code or its caller. * * Your code is running in the context of a work routine. * * They will be detected by lockdep when they occur, but the first might not * occur very often. It depends on what work items are on the workqueue and * what locks they need, which you have no control over. * * In most situations flushing the entire workqueue is overkill; you merely * need to know that a particular work item isn't queued and isn't running. * In such cases you should use cancel_delayed_work_sync() or * cancel_work_sync() instead. */ static inline void flush_scheduled_work(void) { flush_workqueue(system_wq); } /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done * @delay: number of jiffies to wait * * After waiting for a given time this puts a job in the kernel-global * workqueue on the specified CPU. */ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work_on(cpu, system_wq, dwork, delay); } /** * schedule_delayed_work - put work task in global workqueue after delay * @dwork: job to be done * @delay: number of jiffies to wait or 0 for immediate execution * * After waiting for a given time this puts a job in the kernel-global * workqueue. */ static inline bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work(system_wq, dwork, delay); } #ifndef CONFIG_SMP static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } #else long work_on_cpu_key(int cpu, long (*fn)(void *), void *arg, struct lock_class_key *key); /* * A new key is defined for each caller to make sure the work * associated with the function doesn't share its locking class. */ #define work_on_cpu(_cpu, _fn, _arg) \ ({ \ static struct lock_class_key __key; \ \ work_on_cpu_key(_cpu, _fn, _arg, &__key); \ }) long work_on_cpu_safe_key(int cpu, long (*fn)(void *), void *arg, struct lock_class_key *key); /* * A new key is defined for each caller to make sure the work * associated with the function doesn't share its locking class. */ #define work_on_cpu_safe(_cpu, _fn, _arg) \ ({ \ static struct lock_class_key __key; \ \ work_on_cpu_safe_key(_cpu, _fn, _arg, &__key); \ }) #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER extern void freeze_workqueues_begin(void); extern bool freeze_workqueues_busy(void); extern void thaw_workqueues(void); #endif /* CONFIG_FREEZER */ #ifdef CONFIG_SYSFS int workqueue_sysfs_register(struct workqueue_struct *wq); #else /* CONFIG_SYSFS */ static inline int workqueue_sysfs_register(struct workqueue_struct *wq) { return 0; } #endif /* CONFIG_SYSFS */ #ifdef CONFIG_WQ_WATCHDOG void wq_watchdog_touch(int cpu); #else /* CONFIG_WQ_WATCHDOG */ static inline void wq_watchdog_touch(int cpu) { } #endif /* CONFIG_WQ_WATCHDOG */ #ifdef CONFIG_SMP int workqueue_prepare_cpu(unsigned int cpu); int workqueue_online_cpu(unsigned int cpu); int workqueue_offline_cpu(unsigned int cpu); #endif void __init workqueue_init_early(void); void __init workqueue_init(void); #endif |
1 2 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 | // SPDX-License-Identifier: GPL-2.0-only /* * HID driver for ELO usb touchscreen 4000/4500 * * Copyright (c) 2013 Jiri Slaby * * Data parsing taken from elousb driver by Vojtech Pavlik. */ #include <linux/hid.h> #include <linux/input.h> #include <linux/module.h> #include <linux/usb.h> #include <linux/workqueue.h> #include "hid-ids.h" #define ELO_PERIODIC_READ_INTERVAL HZ #define ELO_SMARTSET_CMD_TIMEOUT 2000 /* msec */ /* Elo SmartSet commands */ #define ELO_FLUSH_SMARTSET_RESPONSES 0x02 /* Flush all pending smartset responses */ #define ELO_SEND_SMARTSET_COMMAND 0x05 /* Send a smartset command */ #define ELO_GET_SMARTSET_RESPONSE 0x06 /* Get a smartset response */ #define ELO_DIAG 0x64 /* Diagnostics command */ #define ELO_SMARTSET_PACKET_SIZE 8 struct elo_priv { struct usb_device *usbdev; struct delayed_work work; unsigned char buffer[ELO_SMARTSET_PACKET_SIZE]; }; static struct workqueue_struct *wq; static bool use_fw_quirk = true; module_param(use_fw_quirk, bool, S_IRUGO); MODULE_PARM_DESC(use_fw_quirk, "Do periodic pokes for broken M firmwares (default = true)"); static int elo_input_configured(struct hid_device *hdev, struct hid_input *hidinput) { struct input_dev *input = hidinput->input; /* * ELO devices have one Button usage in GenDesk field, which makes * hid-input map it to BTN_LEFT; that confuses userspace, which then * considers the device to be a mouse/touchpad instead of touchscreen. */ clear_bit(BTN_LEFT, input->keybit); set_bit(BTN_TOUCH, input->keybit); set_bit(ABS_PRESSURE, input->absbit); input_set_abs_params(input, ABS_PRESSURE, 0, 256, 0, 0); return 0; } static void elo_process_data(struct input_dev *input, const u8 *data, int size) { int press; input_report_abs(input, ABS_X, (data[3] << 8) | data[2]); input_report_abs(input, ABS_Y, (data[5] << 8) | data[4]); press = 0; if (data[1] & 0x80) press = (data[7] << 8) | data[6]; input_report_abs(input, ABS_PRESSURE, press); if (data[1] & 0x03) { input_report_key(input, BTN_TOUCH, 1); input_sync(input); } if (data[1] & 0x04) input_report_key(input, BTN_TOUCH, 0); input_sync(input); } static int elo_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct hid_input *hidinput; if (!(hdev->claimed & HID_CLAIMED_INPUT) || list_empty(&hdev->inputs)) return 0; hidinput = list_first_entry(&hdev->inputs, struct hid_input, list); switch (report->id) { case 0: if (data[0] == 'T') { /* Mandatory ELO packet marker */ elo_process_data(hidinput->input, data, size); return 1; } break; default: /* unknown report */ /* Unknown report type; pass upstream */ hid_info(hdev, "unknown report type %d\n", report->id); break; } return 0; } static int elo_smartset_send_get(struct usb_device *dev, u8 command, void *data) { unsigned int pipe; u8 dir; if (command == ELO_SEND_SMARTSET_COMMAND) { pipe = usb_sndctrlpipe(dev, 0); dir = USB_DIR_OUT; } else if (command == ELO_GET_SMARTSET_RESPONSE) { pipe = usb_rcvctrlpipe(dev, 0); dir = USB_DIR_IN; } else return -EINVAL; return usb_control_msg(dev, pipe, command, dir | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, 0, data, ELO_SMARTSET_PACKET_SIZE, ELO_SMARTSET_CMD_TIMEOUT); } static int elo_flush_smartset_responses(struct usb_device *dev) { return usb_control_msg(dev, usb_sndctrlpipe(dev, 0), ELO_FLUSH_SMARTSET_RESPONSES, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); } static void elo_work(struct work_struct *work) { struct elo_priv *priv = container_of(work, struct elo_priv, work.work); struct usb_device *dev = priv->usbdev; unsigned char *buffer = priv->buffer; int ret; ret = elo_flush_smartset_responses(dev); if (ret < 0) { dev_err(&dev->dev, "initial FLUSH_SMARTSET_RESPONSES failed, error %d\n", ret); goto fail; } /* send Diagnostics command */ *buffer = ELO_DIAG; ret = elo_smartset_send_get(dev, ELO_SEND_SMARTSET_COMMAND, buffer); if (ret < 0) { dev_err(&dev->dev, "send Diagnostics Command failed, error %d\n", ret); goto fail; } /* get the result */ ret = elo_smartset_send_get(dev, ELO_GET_SMARTSET_RESPONSE, buffer); if (ret < 0) { dev_err(&dev->dev, "get Diagnostics Command response failed, error %d\n", ret); goto fail; } /* read the ack */ if (*buffer != 'A') { ret = elo_smartset_send_get(dev, ELO_GET_SMARTSET_RESPONSE, buffer); if (ret < 0) { dev_err(&dev->dev, "get acknowledge response failed, error %d\n", ret); goto fail; } } fail: ret = elo_flush_smartset_responses(dev); if (ret < 0) dev_err(&dev->dev, "final FLUSH_SMARTSET_RESPONSES failed, error %d\n", ret); queue_delayed_work(wq, &priv->work, ELO_PERIODIC_READ_INTERVAL); } /* * Not all Elo devices need the periodic HID descriptor reads. * Only firmware version M needs this. */ static bool elo_broken_firmware(struct usb_device *dev) { struct usb_device *hub = dev->parent; struct usb_device *child = NULL; u16 fw_lvl = le16_to_cpu(dev->descriptor.bcdDevice); u16 child_vid, child_pid; int i; if (!use_fw_quirk) return false; if (fw_lvl != 0x10d) return false; /* iterate sibling devices of the touch controller */ usb_hub_for_each_child(hub, i, child) { child_vid = le16_to_cpu(child->descriptor.idVendor); child_pid = le16_to_cpu(child->descriptor.idProduct); /* * If one of the devices below is present attached as a sibling of * the touch controller then this is a newer IBM 4820 monitor that * does not need the IBM-requested workaround if fw level is * 0x010d - aka 'M'. * No other HW can have this combination. */ if (child_vid==0x04b3) { switch (child_pid) { case 0x4676: /* 4820 21x Video */ case 0x4677: /* 4820 51x Video */ case 0x4678: /* 4820 2Lx Video */ case 0x4679: /* 4820 5Lx Video */ return false; } } } return true; } static int elo_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct elo_priv *priv; int ret; if (!hid_is_usb(hdev)) return -EINVAL; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; INIT_DELAYED_WORK(&priv->work, elo_work); priv->usbdev = interface_to_usbdev(to_usb_interface(hdev->dev.parent)); hid_set_drvdata(hdev, priv); ret = hid_parse(hdev); if (ret) { hid_err(hdev, "parse failed\n"); goto err_free; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { hid_err(hdev, "hw start failed\n"); goto err_free; } if (elo_broken_firmware(priv->usbdev)) { hid_info(hdev, "broken firmware found, installing workaround\n"); queue_delayed_work(wq, &priv->work, ELO_PERIODIC_READ_INTERVAL); } return 0; err_free: kfree(priv); return ret; } static void elo_remove(struct hid_device *hdev) { struct elo_priv *priv = hid_get_drvdata(hdev); hid_hw_stop(hdev); cancel_delayed_work_sync(&priv->work); kfree(priv); } static const struct hid_device_id elo_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ELO, 0x0009), }, { HID_USB_DEVICE(USB_VENDOR_ID_ELO, 0x0030), }, { } }; MODULE_DEVICE_TABLE(hid, elo_devices); static struct hid_driver elo_driver = { .name = "elo", .id_table = elo_devices, .probe = elo_probe, .remove = elo_remove, .raw_event = elo_raw_event, .input_configured = elo_input_configured, }; static int __init elo_driver_init(void) { int ret; wq = create_singlethread_workqueue("elousb"); if (!wq) return -ENOMEM; ret = hid_register_driver(&elo_driver); if (ret) destroy_workqueue(wq); return ret; } module_init(elo_driver_init); static void __exit elo_driver_exit(void) { hid_unregister_driver(&elo_driver); destroy_workqueue(wq); } module_exit(elo_driver_exit); MODULE_AUTHOR("Jiri Slaby <jslaby@suse.cz>"); MODULE_LICENSE("GPL"); |
10 10 88 91 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Video for Linux Two * * A generic video device interface for the LINUX operating system * using a set of device structures/vectors for low level operations. * * This file replaces the videodev.c file that comes with the * regular kernel distribution. * * Author: Bill Dirks <bill@thedirks.org> * based on code by Alan Cox, <alan@cymru.net> */ /* * Video capture interface for Linux * * A generic video device interface for the LINUX operating system * using a set of device structures/vectors for low level operations. * * Author: Alan Cox, <alan@lxorguk.ukuu.org.uk> * * Fixes: */ /* * Video4linux 1/2 integration by Justin Schoeman * <justin@suntiger.ee.up.ac.za> * 2.4 PROCFS support ported from 2.4 kernels by * Iñaki GarcÃa Etxebarria <garetxe@euskalnet.net> * Makefile fix by "W. Michael Petullo" <mike@flyn.org> * 2.4 devfs support ported from 2.4 kernels by * Dan Merillat <dan@merillat.org> * Added Gerd Knorrs v4l1 enhancements (Justin Schoeman) */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/uaccess.h> #include <asm/io.h> #include <asm/div64.h> #include <media/v4l2-common.h> #include <media/v4l2-device.h> #include <media/v4l2-ctrls.h> #include <linux/videodev2.h> /* * * V 4 L 2 D R I V E R H E L P E R A P I * */ /* * Video Standard Operations (contributed by Michael Schimek) */ /* Helper functions for control handling */ /* Fill in a struct v4l2_queryctrl */ int v4l2_ctrl_query_fill(struct v4l2_queryctrl *qctrl, s32 _min, s32 _max, s32 _step, s32 _def) { const char *name; s64 min = _min; s64 max = _max; u64 step = _step; s64 def = _def; v4l2_ctrl_fill(qctrl->id, &name, &qctrl->type, &min, &max, &step, &def, &qctrl->flags); if (name == NULL) return -EINVAL; qctrl->minimum = min; qctrl->maximum = max; qctrl->step = step; qctrl->default_value = def; qctrl->reserved[0] = qctrl->reserved[1] = 0; strscpy(qctrl->name, name, sizeof(qctrl->name)); return 0; } EXPORT_SYMBOL(v4l2_ctrl_query_fill); /* Clamp x to be between min and max, aligned to a multiple of 2^align. min * and max don't have to be aligned, but there must be at least one valid * value. E.g., min=17,max=31,align=4 is not allowed as there are no multiples * of 16 between 17 and 31. */ static unsigned int clamp_align(unsigned int x, unsigned int min, unsigned int max, unsigned int align) { /* Bits that must be zero to be aligned */ unsigned int mask = ~((1 << align) - 1); /* Clamp to aligned min and max */ x = clamp(x, (min + ~mask) & mask, max & mask); /* Round to nearest aligned value */ if (align) x = (x + (1 << (align - 1))) & mask; return x; } static unsigned int clamp_roundup(unsigned int x, unsigned int min, unsigned int max, unsigned int alignment) { x = clamp(x, min, max); if (alignment) x = round_up(x, alignment); return x; } void v4l_bound_align_image(u32 *w, unsigned int wmin, unsigned int wmax, unsigned int walign, u32 *h, unsigned int hmin, unsigned int hmax, unsigned int halign, unsigned int salign) { *w = clamp_align(*w, wmin, wmax, walign); *h = clamp_align(*h, hmin, hmax, halign); /* Usually we don't need to align the size and are done now. */ if (!salign) return; /* How much alignment do we have? */ walign = __ffs(*w); halign = __ffs(*h); /* Enough to satisfy the image alignment? */ if (walign + halign < salign) { /* Max walign where there is still a valid width */ unsigned int wmaxa = __fls(wmax ^ (wmin - 1)); /* Max halign where there is still a valid height */ unsigned int hmaxa = __fls(hmax ^ (hmin - 1)); /* up the smaller alignment until we have enough */ do { if (halign >= hmaxa || (walign <= halign && walign < wmaxa)) { *w = clamp_align(*w, wmin, wmax, walign + 1); walign = __ffs(*w); } else { *h = clamp_align(*h, hmin, hmax, halign + 1); halign = __ffs(*h); } } while (halign + walign < salign); } } EXPORT_SYMBOL_GPL(v4l_bound_align_image); const void * __v4l2_find_nearest_size(const void *array, size_t array_size, size_t entry_size, size_t width_offset, size_t height_offset, s32 width, s32 height) { u32 error, min_error = U32_MAX; const void *best = NULL; unsigned int i; if (!array) return NULL; for (i = 0; i < array_size; i++, array += entry_size) { const u32 *entry_width = array + width_offset; const u32 *entry_height = array + height_offset; error = abs(*entry_width - width) + abs(*entry_height - height); if (error > min_error) continue; min_error = error; best = array; if (!error) break; } return best; } EXPORT_SYMBOL_GPL(__v4l2_find_nearest_size); int v4l2_g_parm_cap(struct video_device *vdev, struct v4l2_subdev *sd, struct v4l2_streamparm *a) { struct v4l2_subdev_frame_interval ival = { 0 }; int ret; if (a->type != V4L2_BUF_TYPE_VIDEO_CAPTURE && a->type != V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE) return -EINVAL; if (vdev->device_caps & V4L2_CAP_READWRITE) a->parm.capture.readbuffers = 2; if (v4l2_subdev_has_op(sd, video, g_frame_interval)) a->parm.capture.capability = V4L2_CAP_TIMEPERFRAME; ret = v4l2_subdev_call(sd, video, g_frame_interval, &ival); if (!ret) a->parm.capture.timeperframe = ival.interval; return ret; } EXPORT_SYMBOL_GPL(v4l2_g_parm_cap); int v4l2_s_parm_cap(struct video_device *vdev, struct v4l2_subdev *sd, struct v4l2_streamparm *a) { struct v4l2_subdev_frame_interval ival = { .interval = a->parm.capture.timeperframe }; int ret; if (a->type != V4L2_BUF_TYPE_VIDEO_CAPTURE && a->type != V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE) return -EINVAL; memset(&a->parm, 0, sizeof(a->parm)); if (vdev->device_caps & V4L2_CAP_READWRITE) a->parm.capture.readbuffers = 2; else a->parm.capture.readbuffers = 0; if (v4l2_subdev_has_op(sd, video, g_frame_interval)) a->parm.capture.capability = V4L2_CAP_TIMEPERFRAME; ret = v4l2_subdev_call(sd, video, s_frame_interval, &ival); if (!ret) a->parm.capture.timeperframe = ival.interval; return ret; } EXPORT_SYMBOL_GPL(v4l2_s_parm_cap); const struct v4l2_format_info *v4l2_format_info(u32 format) { static const struct v4l2_format_info formats[] = { /* RGB formats */ { .format = V4L2_PIX_FMT_BGR24, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 3, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_RGB24, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 3, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_HSV24, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 3, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_BGR32, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 4, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_XBGR32, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 4, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_BGRX32, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 4, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_RGB32, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 4, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_XRGB32, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 4, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_RGBX32, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 4, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_HSV32, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 4, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_ARGB32, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 4, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_RGBA32, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 4, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_ABGR32, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 4, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_BGRA32, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 4, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_RGB565, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_RGB555, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_BGR666, .pixel_enc = V4L2_PIXEL_ENC_RGB, .mem_planes = 1, .comp_planes = 1, .bpp = { 4, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, /* YUV packed formats */ { .format = V4L2_PIX_FMT_YUYV, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 2, .vdiv = 1 }, { .format = V4L2_PIX_FMT_YVYU, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 2, .vdiv = 1 }, { .format = V4L2_PIX_FMT_UYVY, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 2, .vdiv = 1 }, { .format = V4L2_PIX_FMT_VYUY, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 2, .vdiv = 1 }, /* YUV planar formats */ { .format = V4L2_PIX_FMT_NV12, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 2 }, { .format = V4L2_PIX_FMT_NV21, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 2 }, { .format = V4L2_PIX_FMT_NV16, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 1 }, { .format = V4L2_PIX_FMT_NV61, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 1 }, { .format = V4L2_PIX_FMT_NV24, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_NV42, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_YUV410, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 4, .vdiv = 4 }, { .format = V4L2_PIX_FMT_YVU410, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 4, .vdiv = 4 }, { .format = V4L2_PIX_FMT_YUV411P, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 4, .vdiv = 1 }, { .format = V4L2_PIX_FMT_YUV420, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 2, .vdiv = 2 }, { .format = V4L2_PIX_FMT_YVU420, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 2, .vdiv = 2 }, { .format = V4L2_PIX_FMT_YUV422P, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 2, .vdiv = 1 }, { .format = V4L2_PIX_FMT_GREY, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, /* YUV planar formats, non contiguous variant */ { .format = V4L2_PIX_FMT_YUV420M, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 3, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 2, .vdiv = 2 }, { .format = V4L2_PIX_FMT_YVU420M, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 3, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 2, .vdiv = 2 }, { .format = V4L2_PIX_FMT_YUV422M, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 3, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 2, .vdiv = 1 }, { .format = V4L2_PIX_FMT_YVU422M, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 3, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 2, .vdiv = 1 }, { .format = V4L2_PIX_FMT_YUV444M, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 3, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_YVU444M, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 3, .comp_planes = 3, .bpp = { 1, 1, 1, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_NV12M, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 2, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 2 }, { .format = V4L2_PIX_FMT_NV21M, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 2, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 2 }, { .format = V4L2_PIX_FMT_NV16M, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 2, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 1 }, { .format = V4L2_PIX_FMT_NV61M, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 2, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 1 }, /* Bayer RGB formats */ { .format = V4L2_PIX_FMT_SBGGR8, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SGBRG8, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SGRBG8, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SRGGB8, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SBGGR10, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SGBRG10, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SGRBG10, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SRGGB10, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SBGGR10ALAW8, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SGBRG10ALAW8, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SGRBG10ALAW8, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SRGGB10ALAW8, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SBGGR10DPCM8, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SGBRG10DPCM8, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SGRBG10DPCM8, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SRGGB10DPCM8, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SBGGR12, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SGBRG12, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SGRBG12, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SRGGB12, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, }; unsigned int i; for (i = 0; i < ARRAY_SIZE(formats); ++i) if (formats[i].format == format) return &formats[i]; return NULL; } EXPORT_SYMBOL(v4l2_format_info); static inline unsigned int v4l2_format_block_width(const struct v4l2_format_info *info, int plane) { if (!info->block_w[plane]) return 1; return info->block_w[plane]; } static inline unsigned int v4l2_format_block_height(const struct v4l2_format_info *info, int plane) { if (!info->block_h[plane]) return 1; return info->block_h[plane]; } void v4l2_apply_frmsize_constraints(u32 *width, u32 *height, const struct v4l2_frmsize_stepwise *frmsize) { if (!frmsize) return; /* * Clamp width/height to meet min/max constraints and round it up to * macroblock alignment. */ *width = clamp_roundup(*width, frmsize->min_width, frmsize->max_width, frmsize->step_width); *height = clamp_roundup(*height, frmsize->min_height, frmsize->max_height, frmsize->step_height); } EXPORT_SYMBOL_GPL(v4l2_apply_frmsize_constraints); int v4l2_fill_pixfmt_mp(struct v4l2_pix_format_mplane *pixfmt, u32 pixelformat, u32 width, u32 height) { const struct v4l2_format_info *info; struct v4l2_plane_pix_format *plane; int i; info = v4l2_format_info(pixelformat); if (!info) return -EINVAL; pixfmt->width = width; pixfmt->height = height; pixfmt->pixelformat = pixelformat; pixfmt->num_planes = info->mem_planes; if (info->mem_planes == 1) { plane = &pixfmt->plane_fmt[0]; plane->bytesperline = ALIGN(width, v4l2_format_block_width(info, 0)) * info->bpp[0]; plane->sizeimage = 0; for (i = 0; i < info->comp_planes; i++) { unsigned int hdiv = (i == 0) ? 1 : info->hdiv; unsigned int vdiv = (i == 0) ? 1 : info->vdiv; unsigned int aligned_width; unsigned int aligned_height; aligned_width = ALIGN(width, v4l2_format_block_width(info, i)); aligned_height = ALIGN(height, v4l2_format_block_height(info, i)); plane->sizeimage += info->bpp[i] * DIV_ROUND_UP(aligned_width, hdiv) * DIV_ROUND_UP(aligned_height, vdiv); } } else { for (i = 0; i < info->comp_planes; i++) { unsigned int hdiv = (i == 0) ? 1 : info->hdiv; unsigned int vdiv = (i == 0) ? 1 : info->vdiv; unsigned int aligned_width; unsigned int aligned_height; aligned_width = ALIGN(width, v4l2_format_block_width(info, i)); aligned_height = ALIGN(height, v4l2_format_block_height(info, i)); plane = &pixfmt->plane_fmt[i]; plane->bytesperline = info->bpp[i] * DIV_ROUND_UP(aligned_width, hdiv); plane->sizeimage = plane->bytesperline * DIV_ROUND_UP(aligned_height, vdiv); } } return 0; } EXPORT_SYMBOL_GPL(v4l2_fill_pixfmt_mp); int v4l2_fill_pixfmt(struct v4l2_pix_format *pixfmt, u32 pixelformat, u32 width, u32 height) { const struct v4l2_format_info *info; int i; info = v4l2_format_info(pixelformat); if (!info) return -EINVAL; /* Single planar API cannot be used for multi plane formats. */ if (info->mem_planes > 1) return -EINVAL; pixfmt->width = width; pixfmt->height = height; pixfmt->pixelformat = pixelformat; pixfmt->bytesperline = ALIGN(width, v4l2_format_block_width(info, 0)) * info->bpp[0]; pixfmt->sizeimage = 0; for (i = 0; i < info->comp_planes; i++) { unsigned int hdiv = (i == 0) ? 1 : info->hdiv; unsigned int vdiv = (i == 0) ? 1 : info->vdiv; unsigned int aligned_width; unsigned int aligned_height; aligned_width = ALIGN(width, v4l2_format_block_width(info, i)); aligned_height = ALIGN(height, v4l2_format_block_height(info, i)); pixfmt->sizeimage += info->bpp[i] * DIV_ROUND_UP(aligned_width, hdiv) * DIV_ROUND_UP(aligned_height, vdiv); } return 0; } EXPORT_SYMBOL_GPL(v4l2_fill_pixfmt); s64 v4l2_get_link_freq(struct v4l2_ctrl_handler *handler, unsigned int mul, unsigned int div) { struct v4l2_ctrl *ctrl; s64 freq; ctrl = v4l2_ctrl_find(handler, V4L2_CID_LINK_FREQ); if (ctrl) { struct v4l2_querymenu qm = { .id = V4L2_CID_LINK_FREQ }; int ret; qm.index = v4l2_ctrl_g_ctrl(ctrl); ret = v4l2_querymenu(handler, &qm); if (ret) return -ENOENT; freq = qm.value; } else { if (!mul || !div) return -ENOENT; ctrl = v4l2_ctrl_find(handler, V4L2_CID_PIXEL_RATE); if (!ctrl) return -ENOENT; freq = div_u64(v4l2_ctrl_g_ctrl_int64(ctrl) * mul, div); pr_warn("%s: Link frequency estimated using pixel rate: result might be inaccurate\n", __func__); pr_warn("%s: Consider implementing support for V4L2_CID_LINK_FREQ in the transmitter driver\n", __func__); } return freq > 0 ? freq : -EINVAL; } EXPORT_SYMBOL_GPL(v4l2_get_link_freq); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_FB_H #define _ASM_X86_FB_H #include <linux/fb.h> #include <linux/fs.h> #include <asm/page.h> static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma, unsigned long off) { unsigned long prot; prot = pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK; if (boot_cpu_data.x86 > 3) pgprot_val(vma->vm_page_prot) = prot | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS); } extern int fb_is_primary_device(struct fb_info *info); #endif /* _ASM_X86_FB_H */ |
9 9 6 2 30 5 3 14 8 2 16 16 16 6 16 9 5 8 8 6 19 8 10 4 10 10 9 10 6 4 9 9 9 8 5 1 1 4 4 16 9 1 1 2 1 10 5 6 16 16 16 16 13 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool.h> #include <linux/phy.h> #include "netlink.h" #include "common.h" struct strset_info { bool per_dev; bool free_strings; unsigned int count; const char (*strings)[ETH_GSTRING_LEN]; }; static const struct strset_info info_template[] = { [ETH_SS_TEST] = { .per_dev = true, }, [ETH_SS_STATS] = { .per_dev = true, }, [ETH_SS_PRIV_FLAGS] = { .per_dev = true, }, [ETH_SS_FEATURES] = { .per_dev = false, .count = ARRAY_SIZE(netdev_features_strings), .strings = netdev_features_strings, }, [ETH_SS_RSS_HASH_FUNCS] = { .per_dev = false, .count = ARRAY_SIZE(rss_hash_func_strings), .strings = rss_hash_func_strings, }, [ETH_SS_TUNABLES] = { .per_dev = false, .count = ARRAY_SIZE(tunable_strings), .strings = tunable_strings, }, [ETH_SS_PHY_STATS] = { .per_dev = true, }, [ETH_SS_PHY_TUNABLES] = { .per_dev = false, .count = ARRAY_SIZE(phy_tunable_strings), .strings = phy_tunable_strings, }, [ETH_SS_LINK_MODES] = { .per_dev = false, .count = __ETHTOOL_LINK_MODE_MASK_NBITS, .strings = link_mode_names, }, [ETH_SS_MSG_CLASSES] = { .per_dev = false, .count = NETIF_MSG_CLASS_COUNT, .strings = netif_msg_class_names, }, [ETH_SS_WOL_MODES] = { .per_dev = false, .count = WOL_MODE_COUNT, .strings = wol_mode_names, }, [ETH_SS_SOF_TIMESTAMPING] = { .per_dev = false, .count = __SOF_TIMESTAMPING_CNT, .strings = sof_timestamping_names, }, [ETH_SS_TS_TX_TYPES] = { .per_dev = false, .count = __HWTSTAMP_TX_CNT, .strings = ts_tx_type_names, }, [ETH_SS_TS_RX_FILTERS] = { .per_dev = false, .count = __HWTSTAMP_FILTER_CNT, .strings = ts_rx_filter_names, }, [ETH_SS_UDP_TUNNEL_TYPES] = { .per_dev = false, .count = __ETHTOOL_UDP_TUNNEL_TYPE_CNT, .strings = udp_tunnel_type_names, }, [ETH_SS_STATS_STD] = { .per_dev = false, .count = __ETHTOOL_STATS_CNT, .strings = stats_std_names, }, [ETH_SS_STATS_ETH_PHY] = { .per_dev = false, .count = __ETHTOOL_A_STATS_ETH_PHY_CNT, .strings = stats_eth_phy_names, }, [ETH_SS_STATS_ETH_MAC] = { .per_dev = false, .count = __ETHTOOL_A_STATS_ETH_MAC_CNT, .strings = stats_eth_mac_names, }, [ETH_SS_STATS_ETH_CTRL] = { .per_dev = false, .count = __ETHTOOL_A_STATS_ETH_CTRL_CNT, .strings = stats_eth_ctrl_names, }, [ETH_SS_STATS_RMON] = { .per_dev = false, .count = __ETHTOOL_A_STATS_RMON_CNT, .strings = stats_rmon_names, }, }; struct strset_req_info { struct ethnl_req_info base; u32 req_ids; bool counts_only; }; #define STRSET_REQINFO(__req_base) \ container_of(__req_base, struct strset_req_info, base) struct strset_reply_data { struct ethnl_reply_data base; struct strset_info sets[ETH_SS_COUNT]; }; #define STRSET_REPDATA(__reply_base) \ container_of(__reply_base, struct strset_reply_data, base) const struct nla_policy ethnl_strset_get_policy[] = { [ETHTOOL_A_STRSET_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_STRSET_STRINGSETS] = { .type = NLA_NESTED }, [ETHTOOL_A_STRSET_COUNTS_ONLY] = { .type = NLA_FLAG }, }; static const struct nla_policy get_stringset_policy[] = { [ETHTOOL_A_STRINGSET_ID] = { .type = NLA_U32 }, }; /** * strset_include() - test if a string set should be included in reply * @info: parsed client request * @data: pointer to request data structure * @id: id of string set to check (ETH_SS_* constants) */ static bool strset_include(const struct strset_req_info *info, const struct strset_reply_data *data, u32 id) { bool per_dev; BUILD_BUG_ON(ETH_SS_COUNT >= BITS_PER_BYTE * sizeof(info->req_ids)); if (info->req_ids) return info->req_ids & (1U << id); per_dev = data->sets[id].per_dev; if (!per_dev && !data->sets[id].strings) return false; return data->base.dev ? per_dev : !per_dev; } static int strset_get_id(const struct nlattr *nest, u32 *val, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(get_stringset_policy)]; int ret; ret = nla_parse_nested(tb, ARRAY_SIZE(get_stringset_policy) - 1, nest, get_stringset_policy, extack); if (ret < 0) return ret; if (!tb[ETHTOOL_A_STRINGSET_ID]) return -EINVAL; *val = nla_get_u32(tb[ETHTOOL_A_STRINGSET_ID]); return 0; } static const struct nla_policy strset_stringsets_policy[] = { [ETHTOOL_A_STRINGSETS_STRINGSET] = { .type = NLA_NESTED }, }; static int strset_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, struct netlink_ext_ack *extack) { struct strset_req_info *req_info = STRSET_REQINFO(req_base); struct nlattr *nest = tb[ETHTOOL_A_STRSET_STRINGSETS]; struct nlattr *attr; int rem, ret; if (!nest) return 0; ret = nla_validate_nested(nest, ARRAY_SIZE(strset_stringsets_policy) - 1, strset_stringsets_policy, extack); if (ret < 0) return ret; req_info->counts_only = tb[ETHTOOL_A_STRSET_COUNTS_ONLY]; nla_for_each_nested(attr, nest, rem) { u32 id; if (WARN_ONCE(nla_type(attr) != ETHTOOL_A_STRINGSETS_STRINGSET, "unexpected attrtype %u in ETHTOOL_A_STRSET_STRINGSETS\n", nla_type(attr))) return -EINVAL; ret = strset_get_id(attr, &id, extack); if (ret < 0) return ret; if (id >= ETH_SS_COUNT) { NL_SET_ERR_MSG_ATTR(extack, attr, "unknown string set id"); return -EOPNOTSUPP; } req_info->req_ids |= (1U << id); } return 0; } static void strset_cleanup_data(struct ethnl_reply_data *reply_base) { struct strset_reply_data *data = STRSET_REPDATA(reply_base); unsigned int i; for (i = 0; i < ETH_SS_COUNT; i++) if (data->sets[i].free_strings) { kfree(data->sets[i].strings); data->sets[i].strings = NULL; data->sets[i].free_strings = false; } } static int strset_prepare_set(struct strset_info *info, struct net_device *dev, unsigned int id, bool counts_only) { const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; const struct ethtool_ops *ops = dev->ethtool_ops; void *strings; int count, ret; if (id == ETH_SS_PHY_STATS && dev->phydev && !ops->get_ethtool_phy_stats && phy_ops && phy_ops->get_sset_count) ret = phy_ops->get_sset_count(dev->phydev); else if (ops->get_sset_count && ops->get_strings) ret = ops->get_sset_count(dev, id); else ret = -EOPNOTSUPP; if (ret <= 0) { info->count = 0; return 0; } count = ret; if (!counts_only) { strings = kcalloc(count, ETH_GSTRING_LEN, GFP_KERNEL); if (!strings) return -ENOMEM; if (id == ETH_SS_PHY_STATS && dev->phydev && !ops->get_ethtool_phy_stats && phy_ops && phy_ops->get_strings) phy_ops->get_strings(dev->phydev, strings); else ops->get_strings(dev, id, strings); info->strings = strings; info->free_strings = true; } info->count = count; return 0; } static int strset_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) { const struct strset_req_info *req_info = STRSET_REQINFO(req_base); struct strset_reply_data *data = STRSET_REPDATA(reply_base); struct net_device *dev = reply_base->dev; unsigned int i; int ret; BUILD_BUG_ON(ARRAY_SIZE(info_template) != ETH_SS_COUNT); memcpy(&data->sets, &info_template, sizeof(data->sets)); if (!dev) { for (i = 0; i < ETH_SS_COUNT; i++) { if ((req_info->req_ids & (1U << i)) && data->sets[i].per_dev) { if (info) GENL_SET_ERR_MSG(info, "requested per device strings without dev"); return -EINVAL; } } return 0; } ret = ethnl_ops_begin(dev); if (ret < 0) goto err_strset; for (i = 0; i < ETH_SS_COUNT; i++) { if (!strset_include(req_info, data, i) || !data->sets[i].per_dev) continue; ret = strset_prepare_set(&data->sets[i], dev, i, req_info->counts_only); if (ret < 0) goto err_ops; } ethnl_ops_complete(dev); return 0; err_ops: ethnl_ops_complete(dev); err_strset: strset_cleanup_data(reply_base); return ret; } /* calculate size of ETHTOOL_A_STRSET_STRINGSET nest for one string set */ static int strset_set_size(const struct strset_info *info, bool counts_only) { unsigned int len = 0; unsigned int i; if (info->count == 0) return 0; if (counts_only) return nla_total_size(2 * nla_total_size(sizeof(u32))); for (i = 0; i < info->count; i++) { const char *str = info->strings[i]; /* ETHTOOL_A_STRING_INDEX, ETHTOOL_A_STRING_VALUE, nest */ len += nla_total_size(nla_total_size(sizeof(u32)) + ethnl_strz_size(str)); } /* ETHTOOL_A_STRINGSET_ID, ETHTOOL_A_STRINGSET_COUNT */ len = 2 * nla_total_size(sizeof(u32)) + nla_total_size(len); return nla_total_size(len); } static int strset_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct strset_req_info *req_info = STRSET_REQINFO(req_base); const struct strset_reply_data *data = STRSET_REPDATA(reply_base); unsigned int i; int len = 0; int ret; len += nla_total_size(0); /* ETHTOOL_A_STRSET_STRINGSETS */ for (i = 0; i < ETH_SS_COUNT; i++) { const struct strset_info *set_info = &data->sets[i]; if (!strset_include(req_info, data, i)) continue; ret = strset_set_size(set_info, req_info->counts_only); if (ret < 0) return ret; len += ret; } return len; } /* fill one string into reply */ static int strset_fill_string(struct sk_buff *skb, const struct strset_info *set_info, u32 idx) { struct nlattr *string_attr; const char *value; value = set_info->strings[idx]; string_attr = nla_nest_start(skb, ETHTOOL_A_STRINGS_STRING); if (!string_attr) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_STRING_INDEX, idx) || ethnl_put_strz(skb, ETHTOOL_A_STRING_VALUE, value)) goto nla_put_failure; nla_nest_end(skb, string_attr); return 0; nla_put_failure: nla_nest_cancel(skb, string_attr); return -EMSGSIZE; } /* fill one string set into reply */ static int strset_fill_set(struct sk_buff *skb, const struct strset_info *set_info, u32 id, bool counts_only) { struct nlattr *stringset_attr; struct nlattr *strings_attr; unsigned int i; if (!set_info->per_dev && !set_info->strings) return -EOPNOTSUPP; if (set_info->count == 0) return 0; stringset_attr = nla_nest_start(skb, ETHTOOL_A_STRINGSETS_STRINGSET); if (!stringset_attr) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_STRINGSET_ID, id) || nla_put_u32(skb, ETHTOOL_A_STRINGSET_COUNT, set_info->count)) goto nla_put_failure; if (!counts_only) { strings_attr = nla_nest_start(skb, ETHTOOL_A_STRINGSET_STRINGS); if (!strings_attr) goto nla_put_failure; for (i = 0; i < set_info->count; i++) { if (strset_fill_string(skb, set_info, i) < 0) goto nla_put_failure; } nla_nest_end(skb, strings_attr); } nla_nest_end(skb, stringset_attr); return 0; nla_put_failure: nla_nest_cancel(skb, stringset_attr); return -EMSGSIZE; } static int strset_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct strset_req_info *req_info = STRSET_REQINFO(req_base); const struct strset_reply_data *data = STRSET_REPDATA(reply_base); struct nlattr *nest; unsigned int i; int ret; nest = nla_nest_start(skb, ETHTOOL_A_STRSET_STRINGSETS); if (!nest) return -EMSGSIZE; for (i = 0; i < ETH_SS_COUNT; i++) { if (strset_include(req_info, data, i)) { ret = strset_fill_set(skb, &data->sets[i], i, req_info->counts_only); if (ret < 0) goto nla_put_failure; } } nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return ret; } const struct ethnl_request_ops ethnl_strset_request_ops = { .request_cmd = ETHTOOL_MSG_STRSET_GET, .reply_cmd = ETHTOOL_MSG_STRSET_GET_REPLY, .hdr_attr = ETHTOOL_A_STRSET_HEADER, .req_info_size = sizeof(struct strset_req_info), .reply_data_size = sizeof(struct strset_reply_data), .allow_nodev_do = true, .parse_request = strset_parse_request, .prepare_data = strset_prepare_data, .reply_size = strset_reply_size, .fill_reply = strset_fill_reply, .cleanup_data = strset_cleanup_data, }; |
92 72 1 19 20 981 30 952 27 15 26 1020 52 15 264 204 42 245 188 37 632 633 608 46 1 400 9 39 239 239 30 212 42 211 239 1 196 41 232 8 6 109 124 9 1 14 222 29 197 438 441 251 1150 1139 33 9 9 9 19 12 7 19 1 1253 1252 2 1041 863 762 285 518 763 760 5 68 1065 1232 330 30 92 380 397 397 369 42 355 77 8 397 87 341 394 230 276 276 276 221 221 71 36 120 32 978 113 76 949 173 53 145 173 666 119 648 339 649 56 8 647 647 215 638 18 21 21 21 8 6 7 1 159 159 133 35 159 152 97 23 158 24 157 45 19 145 32 87 96 39 30 4 5 5 159 5 45 148 139 140 106 8 72 29 118 124 124 124 10 35 2 31 42 42 123 123 114 9 8 106 114 99 45 44 92 114 114 8 31 31 2 31 811 793 29 29 29 175 146 86 33 33 86 86 86 86 86 29 4 33 86 23 23 23 23 11 22 8 20 20 41 18 1 1 1 8 3 19 14 19 19 20 7 37 31 24 37 37 36 41 41 2 10 10 9 3 3 140 70 88 140 154 34 55 55 2 34 169 169 169 13 85 168 44 129 81 132 15 15 3 22 5 25 17 21 25 21 21 25 21 15 55 55 4 22 22 55 55 5 28 25 15 16 11 4 15 86 86 21 176 176 61 15 68 171 171 171 169 2 169 28 169 169 155 77 60 155 245 1 63 18 167 14 131 44 44 176 175 176 147 86 86 38 55 84 85 84 144 174 142 1 1 133 133 39 32 1 20 2 2 2 36 32 32 29 121 83 38 14 32 3 36 10 32 3 54 54 50 4 12 12 10 10 54 54 34 26 105 105 159 159 159 201 202 4 198 19 76 76 62 27 76 68 11 17 61 25 39 3 42 42 42 29 42 63 63 63 63 26 18 42 28 62 1 1 58 8 28 28 22 7 22 18 4 6 14 8 1 24 24 24 709 709 462 247 172 38 145 173 30 152 113 39 45 70 113 113 173 173 173 132 58 58 2 1 16 39 37 19 202 332 93 11 3 57 57 18 41 1 138 138 47 47 27 24 5 45 31 2 43 14 31 45 2 13 30 34 11 43 2 3 42 342 340 424 125 311 424 424 280 156 338 108 215 131 213 128 215 132 128 213 240 87 104 241 333 6 424 1497 14 11 1493 2 1491 1 630 4 6 10 68 9 65 9 1 439 248 688 689 616 615 5 899 730 6 9 721 1157 1160 1160 978 411 9 566 596 468 111 264 960 3 42 967 5 985 1013 19 3 2 4 738 685 4 42 93 2 970 18 609 1 372 938 43 980 2 921 10 56 162 818 972 117 2 2 611 365 978 684 4 975 959 15 1 370 29 596 973 602 970 973 15 358 89 575 971 4 971 30 864 32 600 270 865 23 576 789 1 9 14 8 17 6 14 2 959 19 7 961 111 1036 24 6 6 261 124 5 5 5 5 5 265 265 264 261 265 1045 247 1044 902 197 132 132 1044 1044 1045 1045 10 1036 1046 501 1006 75 48 46 2 1040 10 369 29 661 1037 11 409 663 1046 24 779 265 1043 29 29 1043 1043 1044 145 3 141 142 142 127 1 5 119 3 1 120 92 119 88 28 6 2 21 2 21 55 34 57 6 26 39 15 49 39 11 11 49 11 39 11 52 112 112 31 4 39 37 2 34 5 34 4 34 5 37 3 36 3 39 18 13 5 18 17 206 358 519 534 473 473 474 54 125 1044 924 126 1046 1058 2 17 1053 174 1 81 3 49 85 106 171 172 172 3 3 3 3 1027 1029 19 941 180 1022 772 7 767 7 7 19 2 12 13 5 12 6 7 4 9 9 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/inode.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * 64-bit file support on 64-bit platforms by Jakub Jelinek * (jj@sunsite.ms.mff.cuni.cz) * * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 */ #include <linux/fs.h> #include <linux/mount.h> #include <linux/time.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/dax.h> #include <linux/quotaops.h> #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/kernel.h> #include <linux/printk.h> #include <linux/slab.h> #include <linux/bitops.h> #include <linux/iomap.h> #include <linux/iversion.h> #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "truncate.h" #include <trace/events/ext4.h> static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __u16 dummy_csum = 0; int offset = offsetof(struct ext4_inode, i_checksum_lo); unsigned int csum_size = sizeof(dummy_csum); csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, EXT4_GOOD_OLD_INODE_SIZE - offset); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { offset = offsetof(struct ext4_inode, i_checksum_hi); csum = ext4_chksum(sbi, csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE, offset - EXT4_GOOD_OLD_INODE_SIZE); if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; } csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, EXT4_INODE_SIZE(inode->i_sb) - offset); } return csum; } static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { __u32 provided, calculated; if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || !ext4_has_metadata_csum(inode->i_sb)) return 1; provided = le16_to_cpu(raw->i_checksum_lo); calculated = ext4_inode_csum(inode, raw, ei); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; else calculated &= 0xFFFF; return provided == calculated; } void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { __u32 csum; if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || !ext4_has_metadata_csum(inode->i_sb)) return; csum = ext4_inode_csum(inode, raw, ei); raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) raw->i_checksum_hi = cpu_to_le16(csum >> 16); } static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { trace_ext4_begin_ordered_truncate(inode, new_size); /* * If jinode is zero, then we never opened the file for * writing, so there's no need to call * jbd2_journal_begin_ordered_truncate() since there's no * outstanding writes we need to flush. */ if (!EXT4_I(inode)->jinode) return 0; return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode, new_size); } static void ext4_invalidatepage(struct page *page, unsigned int offset, unsigned int length); static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); /* * Test whether an inode is a fast symlink. * A fast symlink has its symlink data stored in ext4_inode_info->i_data. */ int ext4_inode_is_fast_symlink(struct inode *inode) { if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { int ea_blocks = EXT4_I(inode)->i_file_acl ? EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; if (ext4_has_inline_data(inode)) return 0; return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } return S_ISLNK(inode->i_mode) && inode->i_size && (inode->i_size < EXT4_N_BLOCKS * 4); } /* * Called at the last iput() if i_nlink is zero. */ void ext4_evict_inode(struct inode *inode) { handle_t *handle; int err; /* * Credits for final inode cleanup and freeing: * sb + inode (ext4_orphan_del()), block bitmap, group descriptor * (xattr block freeing), bitmap, group descriptor (inode freeing) */ int extra_credits = 6; struct ext4_xattr_inode_array *ea_inode_array = NULL; bool freeze_protected = false; trace_ext4_evict_inode(inode); if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { /* * When journalling data dirty buffers are tracked only in the * journal. So although mm thinks everything is clean and * ready for reaping the inode might still have some pages to * write in the running transaction or waiting to be * checkpointed. Thus calling jbd2_journal_invalidatepage() * (via truncate_inode_pages()) to discard these buffers can * cause data loss. Also even if we did not discard these * buffers, we would have no way to find them after the inode * is reaped and thus user could see stale data if he tries to * read them before the transaction is checkpointed. So be * careful and force everything to disk here... We use * ei->i_datasync_tid to store the newest transaction * containing inode's data. * * Note that directories do not have this problem because they * don't use page cache. */ if (inode->i_ino != EXT4_JOURNAL_INO && ext4_should_journal_data(inode) && (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && inode->i_data.nrpages) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; jbd2_complete_transaction(journal, commit_tid); filemap_write_and_wait(&inode->i_data); } truncate_inode_pages_final(&inode->i_data); goto no_delete; } if (is_bad_inode(inode)) goto no_delete; dquot_initialize(inode); if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages_final(&inode->i_data); /* * For inodes with journalled data, transaction commit could have * dirtied the inode. And for inodes with dioread_nolock, unwritten * extents converting worker could merge extents and also have dirtied * the inode. Flush worker is ignoring it because of I_FREEING flag but * we still need to remove the inode from the writeback lists. */ if (!list_empty_careful(&inode->i_io_list)) inode_io_list_del(inode); /* * Protect us against freezing - iput() caller didn't have to have any * protection against it. When we are in a running transaction though, * we are already protected against freezing and we cannot grab further * protection due to lock ordering constraints. */ if (!ext4_journal_current_handle()) { sb_start_intwrite(inode->i_sb); freeze_protected = true; } if (!IS_NOQUOTA(inode)) extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); /* * Block bitmap, group descriptor, and inode are accounted in both * ext4_blocks_for_truncate() and extra_credits. So subtract 3. */ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, ext4_blocks_for_truncate(inode) + extra_credits - 3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* * If we're going to skip the normal cleanup, we still need to * make sure that the in-core orphan linked list is properly * cleaned up. */ ext4_orphan_del(NULL, inode); if (freeze_protected) sb_end_intwrite(inode->i_sb); goto no_delete; } if (IS_SYNC(inode)) ext4_handle_sync(handle); /* * Set inode->i_size to 0 before calling ext4_truncate(). We need * special handling of symlinks here because i_size is used to * determine whether ext4_inode_info->i_data contains symlink data or * block mappings. Setting i_size to 0 will remove its fast symlink * status. Erase i_data so that it becomes a valid empty block map. */ if (ext4_inode_is_fast_symlink(inode)) memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data)); inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_warning(inode->i_sb, "couldn't mark inode dirty (err %d)", err); goto stop_handle; } if (inode->i_blocks) { err = ext4_truncate(inode); if (err) { ext4_error_err(inode->i_sb, -err, "couldn't truncate inode %lu (err %d)", inode->i_ino, err); goto stop_handle; } } /* Remove xattr references. */ err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array, extra_credits); if (err) { ext4_warning(inode->i_sb, "xattr delete (err %d)", err); stop_handle: ext4_journal_stop(handle); ext4_orphan_del(NULL, inode); if (freeze_protected) sb_end_intwrite(inode->i_sb); ext4_xattr_inode_array_free(ea_inode_array); goto no_delete; } /* * Kill off the orphan record which ext4_truncate created. * AKPM: I think this can be inside the above `if'. * Note that ext4_orphan_del() has to be able to cope with the * deletion of a non-existent orphan - this is because we don't * know if ext4_truncate() actually created an orphan record. * (Well, we could do this if we need to, but heck - it works) */ ext4_orphan_del(handle, inode); EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds(); /* * One subtle ordering requirement: if anything has gone wrong * (transaction abort, IO errors, whatever), then we can still * do these next steps (the fs will already have been marked as * having errors), but we can't free the inode if the mark_dirty * fails. */ if (ext4_mark_inode_dirty(handle, inode)) /* If that failed, just do the required in-core inode clear. */ ext4_clear_inode(inode); else ext4_free_inode(handle, inode); ext4_journal_stop(handle); if (freeze_protected) sb_end_intwrite(inode->i_sb); ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: /* * Check out some where else accidentally dirty the evicting inode, * which may probably cause inode use-after-free issues later. */ WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list)); if (!list_empty(&EXT4_I(inode)->i_fc_list)) ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } #ifdef CONFIG_QUOTA qsize_t *ext4_get_reserved_space(struct inode *inode) { return &EXT4_I(inode)->i_reserved_quota; } #endif /* * Called with i_data_sem down, which is important since we can call * ext4_discard_preallocations() from here. */ void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); spin_lock(&ei->i_block_reservation_lock); trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_warning(inode->i_sb, "%s: ino %lu, used %d " "with only %d reserved data blocks", __func__, inode->i_ino, used, ei->i_reserved_data_blocks); WARN_ON(1); used = ei->i_reserved_data_blocks; } /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); spin_unlock(&ei->i_block_reservation_lock); /* Update quota subsystem for data blocks */ if (quota_claim) dquot_claim_block(inode, EXT4_C2B(sbi, used)); else { /* * We did fallocate with an offset that is already delayed * allocated. So on delayed allocated writeback we should * not re-claim the quota for fallocated blocks. */ dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); } /* * If we have done all the pending block allocations and if * there aren't any writers on the inode, we can discard the * inode's preallocations. */ if ((ei->i_reserved_data_blocks == 0) && !inode_is_open_for_write(inode)) ext4_discard_preallocations(inode, 0); } static int __check_block_validity(struct inode *inode, const char *func, unsigned int line, struct ext4_map_blocks *map) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; if (journal && inode == journal->j_inode) return 0; if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " "(length %d)", (unsigned long) map->m_lblk, map->m_pblk, map->m_len); return -EFSCORRUPTED; } return 0; } int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len) { int ret; if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, lblk, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); if (ret > 0) ret = 0; return ret; } #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) #ifdef ES_AGGRESSIVE_TEST static void ext4_map_blocks_es_recheck(handle_t *handle, struct inode *inode, struct ext4_map_blocks *es_map, struct ext4_map_blocks *map, int flags) { int retval; map->m_flags = 0; /* * There is a race window that the result is not the same. * e.g. xfstests #223 when dioread_nolock enables. The reason * is that we lookup a block mapping in extent status tree with * out taking i_data_sem. So at the time the unwritten extent * could be converted. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { retval = ext4_ind_map_blocks(handle, inode, map, 0); } up_read((&EXT4_I(inode)->i_data_sem)); /* * We don't check m_len because extent will be collpased in status * tree. So the m_len might not equal. */ if (es_map->m_lblk != map->m_lblk || es_map->m_flags != map->m_flags || es_map->m_pblk != map->m_pblk) { printk("ES cache assertion failed for inode: %lu " "es_cached ex [%d/%d/%llu/%x] != " "found ex [%d/%d/%llu/%x] retval %d flags %x\n", inode->i_ino, es_map->m_lblk, es_map->m_len, es_map->m_pblk, es_map->m_flags, map->m_lblk, map->m_len, map->m_pblk, map->m_flags, retval, flags); } } #endif /* ES_AGGRESSIVE_TEST */ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map) { unsigned int status; int retval; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) retval = ext4_ext_map_blocks(handle, inode, map, 0); else retval = ext4_ind_map_blocks(handle, inode, map, 0); if (retval <= 0) return retval; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " "%lu: retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); return retval; } /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * * Otherwise it takes the write lock of the i_data_sem and allocate blocks * and store the allocated blocks in the result buffer head and mark it * mapped. * * If file type is extents based, it will call ext4_ext_map_blocks(), * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * * On success, it returns the number of blocks being mapped or allocated. if * create==0 and the blocks are pre-allocated and unwritten, the resulting @map * is marked as unwritten. If the create == 1, it will mark @map as mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in * that case, @map is returned as unmapped but we still do fill map->m_len to * indicate the length of a hole starting at map->m_lblk. * * It returns the error in case of allocation failure. */ int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct extent_status es; int retval; int ret = 0; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif map->m_flags = 0; ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n", flags, map->m_len, (unsigned long) map->m_lblk); /* * ext4_map_blocks returns an int, and m_len is an unsigned int */ if (unlikely(map->m_len > INT_MAX)) map->m_len = INT_MAX; /* We can handle the block number less than EXT_MAX_BLOCKS */ if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) return -EFSCORRUPTED; /* Lookup extent status tree firstly */ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; map->m_flags |= ext4_es_is_written(&es) ? EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { map->m_pblk = 0; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; retval = 0; } else { BUG(); } #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif goto found; } /* * Try to see if we can get the block without requesting a new * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { retval = ext4_ind_map_blocks(handle, inode, map, 0); } if (retval > 0) { unsigned int status; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " "%lu: retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && !(status & EXTENT_STATUS_WRITTEN) && ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); } up_read((&EXT4_I(inode)->i_data_sem)); found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; } /* If it is only a block(s) look up */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) return retval; /* * Returns if the blocks have already allocated * * Note that if blocks have been preallocated * ext4_ext_get_block() returns the create = 0 * with buffer head unmapped. */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) /* * If we need to convert extent to unwritten * we continue and do the actual work in * ext4_ext_map_blocks() */ if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; /* * Here we clear m_flags because after allocating an new extent, * it will be set again. */ map->m_flags &= ~EXT4_MAP_FLAGS; /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_block() * with create == 1 flag. */ down_write(&EXT4_I(inode)->i_data_sem); /* * We need to check for EXT4 here because migrate * could have changed the inode type in between */ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags); } else { retval = ext4_ind_map_blocks(handle, inode, map, flags); if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { /* * We allocated new blocks which will result in * i_data's format changing. Force the migrate * to fail by clearing migrate flags */ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); } } if (retval > 0) { unsigned int status; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " "%lu: retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } /* * We have to zeroout blocks before inserting them into extent * status tree. Otherwise someone could look them up there and * use them before they are really zeroed. We also have to * unmap metadata before zeroing as otherwise writeback can * overwrite zeros with stale data from block device. */ if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { ret = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (ret) { retval = ret; goto out_sem; } } /* * If the extent has been zeroed out, we don't need to update * extent status tree. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO) && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es)) goto out_sem; } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && !(status & EXTENT_STATUS_WRITTEN) && ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); } out_sem: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; /* * Inodes with freshly allocated blocks where contents will be * visible after transaction commit must be on transaction's * ordered data list. */ if (map->m_flags & EXT4_MAP_NEW && !(map->m_flags & EXT4_MAP_UNWRITTEN) && !(flags & EXT4_GET_BLOCKS_ZERO) && !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { loff_t start_byte = (loff_t)map->m_lblk << inode->i_blkbits; loff_t length = (loff_t)map->m_len << inode->i_blkbits; if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode, start_byte, length); else ret = ext4_jbd2_inode_add_write(handle, inode, start_byte, length); if (ret) return ret; } } if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN || map->m_flags & EXT4_MAP_MAPPED)) ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + map->m_len - 1); if (retval < 0) ext_debug(inode, "failed with err %d\n", retval); return retval; } /* * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages * we have to be careful as someone else may be manipulating b_state as well. */ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) { unsigned long old_state; unsigned long new_state; flags &= EXT4_MAP_FLAGS; /* Dummy buffer_head? Set non-atomically. */ if (!bh->b_page) { bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; return; } /* * Someone else may be modifying b_state. Be careful! This is ugly but * once we get rid of using bh as a container for mapping information * to pass to / from get_block functions, this can go away. */ do { old_state = READ_ONCE(bh->b_state); new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; } while (unlikely( cmpxchg(&bh->b_state, old_state, new_state) != old_state)); } static int _ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int flags) { struct ext4_map_blocks map; int ret = 0; if (ext4_has_inline_data(inode)) return -ERANGE; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map, flags); if (ret > 0) { map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } else if (ret == 0) { /* hole case, need to fill in bh->b_size */ bh->b_size = inode->i_sb->s_blocksize * map.m_len; } return ret; } int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { return _ext4_get_block(inode, iblock, bh, create ? EXT4_GET_BLOCKS_CREATE : 0); } /* * Get block function used when preparing for buffered write if we require * creating an unwritten extent if blocks haven't been allocated. The extent * will be converted to written after the IO is complete. */ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", inode->i_ino, create); return _ext4_get_block(inode, iblock, bh_result, EXT4_GET_BLOCKS_IO_CREATE_EXT); } /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 /* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct ext4_map_blocks map; struct buffer_head *bh; int create = map_flags & EXT4_GET_BLOCKS_CREATE; int err; ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || handle != NULL || create == 0); map.m_lblk = block; map.m_len = 1; err = ext4_map_blocks(handle, inode, &map, map_flags); if (err == 0) return create ? ERR_PTR(-ENOSPC) : NULL; if (err < 0) return ERR_PTR(err); bh = sb_getblk(inode->i_sb, map.m_pblk); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { ASSERT(create != 0); ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || (handle != NULL)); /* * Now that we do not always journal data, we should * keep in mind whether this should always journal the * new buffer as metadata. For now, regular file * writes use ext4_get_block instead, so it's not a * problem. */ lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (unlikely(err)) { unlock_buffer(bh); goto errout; } if (!buffer_uptodate(bh)) { memset(bh->b_data, 0, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); } unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); if (unlikely(err)) goto errout; } else BUFFER_TRACE(bh, "not a new buffer"); return bh; errout: brelse(bh); return ERR_PTR(err); } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct buffer_head *bh; int ret; bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; if (!bh || ext4_buffer_uptodate(bh)) return bh; ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true); if (ret) { put_bh(bh); return ERR_PTR(ret); } return bh; } /* Read a contiguous batch of blocks. */ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, bool wait, struct buffer_head **bhs) { int i, err; for (i = 0; i < bh_count; i++) { bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */); if (IS_ERR(bhs[i])) { err = PTR_ERR(bhs[i]); bh_count = i; goto out_brelse; } } for (i = 0; i < bh_count; i++) /* Note that NULL bhs[i] is valid because of holes. */ if (bhs[i] && !ext4_buffer_uptodate(bhs[i])) ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false); if (!wait) return 0; for (i = 0; i < bh_count; i++) if (bhs[i]) wait_on_buffer(bhs[i]); for (i = 0; i < bh_count; i++) { if (bhs[i] && !buffer_uptodate(bhs[i])) { err = -EIO; goto out_brelse; } } return 0; out_brelse: for (i = 0; i < bh_count; i++) { brelse(bhs[i]); bhs[i] = NULL; } return err; } int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; unsigned blocksize = head->b_size; int err, ret = 0; struct buffer_head *next; for (bh = head, block_start = 0; ret == 0 && (bh != head || !block_start); block_start = block_end, bh = next) { next = bh->b_this_page; block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (partial && !buffer_uptodate(bh)) *partial = 1; continue; } err = (*fn)(handle, inode, bh); if (!ret) ret = err; } return ret; } /* * To preserve ordering, it is essential that the hole instantiation and * the data write be encapsulated in a single transaction. We cannot * close off a transaction and start a new one between the ext4_get_block() * and the commit_write(). So doing the jbd2_journal_start at the start of * prepare_write() is the right place. * * Also, this function can nest inside ext4_writepage(). In that case, we * *know* that ext4_writepage() has generated enough buffer credits to do the * whole page. So we won't block on the journal in that case, which is good, * because the caller may be PF_MEMALLOC. * * By accident, ext4 can be reentered when a transaction is open via * quota file writes. If we were to commit the transaction while thus * reentered, there can be a deadlock - we would be holding a quota * lock, and the commit would never complete if another thread had a * transaction open and was blocking on the quota lock - a ranking * violation. * * So what we do is to rely on the fact that jbd2_journal_stop/journal_start * will _not_ run commit under these circumstances because handle->h_ref * is elevated. We'll still have enough credits for the tiny quotafile * write. */ int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { int dirty = buffer_dirty(bh); int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; /* * __block_write_begin() could have dirtied some buffers. Clean * the dirty bit as jbd2_journal_get_write_access() could complain * otherwise about fs integrity issues. Setting of the dirty bit * by __block_write_begin() isn't a real problem here as we clear * the bit before releasing a page lock and thus writeback cannot * ever write the buffer. */ if (dirty) clear_buffer_dirty(bh); BUFFER_TRACE(bh, "get write access"); ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (!ret && dirty) ret = ext4_handle_dirty_metadata(handle, NULL, bh); return ret; } #ifdef CONFIG_FS_ENCRYPTION static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; struct inode *inode = page->mapping->host; unsigned block_start, block_end; sector_t block; int err = 0; unsigned blocksize = inode->i_sb->s_blocksize; unsigned bbits; struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; BUG_ON(!PageLocked(page)); BUG_ON(from > PAGE_SIZE); BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); head = page_buffers(page); bbits = ilog2(blocksize); block = (sector_t)page->index << (PAGE_SHIFT - bbits); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (PageUptodate(page)) { set_buffer_uptodate(bh); } continue; } if (buffer_new(bh)) clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) break; if (buffer_new(bh)) { if (PageUptodate(page)) { clear_buffer_new(bh); set_buffer_uptodate(bh); mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) zero_user_segments(page, to, block_end, block_start, from); continue; } } if (PageUptodate(page)) { set_buffer_uptodate(bh); continue; } if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { ext4_read_bh_lock(bh, 0, false); wait[nr_wait++] = bh; } } /* * If we issued read requests, let them complete. */ for (i = 0; i < nr_wait; i++) { wait_on_buffer(wait[i]); if (!buffer_uptodate(wait[i])) err = -EIO; } if (unlikely(err)) { page_zero_new_buffers(page, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; err2 = fscrypt_decrypt_pagecache_blocks(page, blocksize, bh_offset(wait[i])); if (err2) { clear_buffer_uptodate(wait[i]); err = err2; } } } return err; } #endif static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; handle_t *handle; int retries = 0; struct page *page; pgoff_t index; unsigned from, to; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; trace_ext4_write_begin(inode, pos, len, flags); /* * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason */ needed_blocks = ext4_writepage_trans_blocks(inode) + 1; index = pos >> PAGE_SHIFT; from = pos & (PAGE_SIZE - 1); to = from + len; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, flags, pagep); if (ret < 0) return ret; if (ret == 1) return 0; } /* * grab_cache_page_write_begin() can take a long time if the * system is thrashing due to memory pressure, or if the page * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate * the page (if needed) without using GFP_NOFS. */ retry_grab: page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; /* * The same as page allocation, we prealloc buffer heads before * starting the handle. */ if (!page_has_buffers(page)) create_empty_buffers(page, inode->i_sb->s_blocksize, 0); unlock_page(page); retry_journal: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { put_page(page); return PTR_ERR(handle); } lock_page(page); if (page->mapping != mapping) { /* The page got truncated from under us */ unlock_page(page); put_page(page); ext4_journal_stop(handle); goto retry_grab; } /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); #ifdef CONFIG_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(page, pos, len, ext4_get_block_unwritten); else ret = ext4_block_write_begin(page, pos, len, ext4_get_block); #else if (ext4_should_dioread_nolock(inode)) ret = __block_write_begin(page, pos, len, ext4_get_block_unwritten); else ret = __block_write_begin(page, pos, len, ext4_get_block); #endif if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), from, to, NULL, do_journal_get_write_access); } if (ret) { bool extended = (pos + len > inode->i_size) && !ext4_verity_in_progress(inode); unlock_page(page); /* * __block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_mutex. * * Add inode to orphan list in case we crash before * truncate finishes */ if (extended && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ext4_journal_stop(handle); if (extended) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the * orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; put_page(page); return ret; } *pagep = page; return ret; } /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh) { int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); ret = ext4_handle_dirty_metadata(handle, NULL, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); return ret; } /* * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * * ext4 never places buffers on inode->i_mapping->private_list. metadata * buffers are managed internally. */ static int ext4_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); if (ext4_has_inline_data(inode) && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return ext4_write_inline_data_end(inode, pos, len, copied, page); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* * it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. * * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree * blocks are being written past EOF, so skip the i_size update. */ if (!verity) i_size_changed = ext4_update_inode_size(inode, pos + copied); unlock_page(page); put_page(page); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily * makes the holding time of page lock longer. Second, it forces lock * ordering of page lock and transaction start for journaling * filesystems. */ if (i_size_changed) ret = ext4_mark_inode_dirty(handle, inode); if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * This is a private version of page_zero_new_buffers() which doesn't * set the buffer to be dirty, since in data=journalled mode we need * to call ext4_handle_dirty_metadata() instead. */ static void ext4_journalled_zero_new_buffers(handle_t *handle, struct inode *inode, struct page *page, unsigned from, unsigned to) { unsigned int block_start = 0, block_end; struct buffer_head *head, *bh; bh = head = page_buffers(page); do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { if (!PageUptodate(page)) { unsigned start, size; start = max(from, block_start); size = min(to, block_end) - start; zero_user(page, start, size); write_end_fn(handle, inode, bh); } clear_buffer_new(bh); } } block_start = block_end; bh = bh->b_this_page; } while (bh != head); } static int ext4_journalled_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int partial = 0; unsigned from, to; int size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_SIZE - 1); to = from + len; BUG_ON(!ext4_handle_valid(handle)); if (ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, page); if (unlikely(copied < len) && !PageUptodate(page)) { copied = 0; ext4_journalled_zero_new_buffers(handle, inode, page, from, to); } else { if (unlikely(copied < len)) ext4_journalled_zero_new_buffers(handle, inode, page, from + copied, to); ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), from, from + copied, &partial, write_end_fn); if (!partial) SetPageUptodate(page); } if (!verity) size_changed = ext4_update_inode_size(inode, pos + copied); ext4_set_inode_state(inode, EXT4_STATE_JDATA); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; unlock_page(page); put_page(page); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * Reserve space for a single cluster */ static int ext4_da_reserve_space(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); int ret; /* * We will charge metadata quota at writeout time; this saves * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); if (ret) return ret; spin_lock(&ei->i_block_reservation_lock); if (ext4_claim_free_clusters(sbi, 1, 0)) { spin_unlock(&ei->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); return -ENOSPC; } ei->i_reserved_data_blocks++; trace_ext4_da_reserve_space(inode); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ } void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); if (!to_free) return; /* Nothing to release, exit */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); trace_ext4_da_release_space(inode, to_free); if (unlikely(to_free > ei->i_reserved_data_blocks)) { /* * if there aren't enough reserved blocks, then the * counter is messed up somewhere. Since this * function is called from invalidate page, it's * harmless to return without any action. */ ext4_warning(inode->i_sb, "ext4_da_release_space: " "ino %lu, to_free %d with only %d reserved " "data blocks", inode->i_ino, to_free, ei->i_reserved_data_blocks); WARN_ON(1); to_free = ei->i_reserved_data_blocks; } ei->i_reserved_data_blocks -= to_free; /* update fs dirty data blocks counter */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } /* * Delayed allocation stuff */ struct mpage_da_data { struct inode *inode; struct writeback_control *wbc; pgoff_t first_page; /* The first page to write */ pgoff_t next_page; /* Current page to examine */ pgoff_t last_page; /* Last page to examine */ /* * Extent to map - this can be after first_page because that can be * fully mapped. We somewhat abuse m_flags to store whether the extent * is delalloc or unwritten. */ struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; unsigned int scanned_until_end:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, bool invalidate) { int nr_pages, i; pgoff_t index, end; struct pagevec pvec; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; /* This is necessary when next_page == 0. */ if (mpd->first_page >= mpd->next_page) return; mpd->scanned_until_end = 0; index = mpd->first_page; end = mpd->next_page - 1; if (invalidate) { ext4_lblk_t start, last; start = index << (PAGE_SHIFT - inode->i_blkbits); last = end << (PAGE_SHIFT - inode->i_blkbits); /* * avoid racing with extent status tree scans made by * ext4_insert_delayed_block() */ down_write(&EXT4_I(inode)->i_data_sem); ext4_es_remove_extent(inode, start, last - start + 1); up_write(&EXT4_I(inode)->i_data_sem); } pagevec_init(&pvec); while (index <= end) { nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); if (invalidate) { if (page_mapped(page)) clear_page_dirty_for_io(page); block_invalidatepage(page, 0, PAGE_SIZE); ClearPageUptodate(page); } unlock_page(page); } pagevec_release(&pvec); } } static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", EXT4_C2B(EXT4_SB(inode->i_sb), ext4_count_free_clusters(sb))); ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_dirtyclusters_counter))); ext4_msg(sb, KERN_CRIT, "Block reservation details"); ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", ei->i_reserved_data_blocks); return; } static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode, struct buffer_head *bh) { return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); } /* * ext4_insert_delayed_block - adds a delayed block to the extents status * tree, incrementing the reserved cluster/block * count or making a pending reservation * where needed * * @inode - file containing the newly added block * @lblk - logical block to be added * * Returns 0 on success, negative error code on failure. */ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; bool allocated = false; bool reserved = false; /* * If the cluster containing lblk is shared with a delayed, * written, or unwritten extent in a bigalloc file system, it's * already been accounted for and does not need to be reserved. * A pending reservation must be made for the cluster if it's * shared with a written or unwritten extent and doesn't already * have one. Written and unwritten extents can be purged from the * extents status tree if the system is under memory pressure, so * it's necessary to examine the extent tree if a search of the * extents status tree doesn't get a match. */ if (sbi->s_cluster_ratio == 1) { ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ goto errout; reserved = true; } else { /* bigalloc */ if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { if (!ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) { ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); if (ret < 0) goto errout; if (ret == 0) { ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ goto errout; reserved = true; } else { allocated = true; } } else { allocated = true; } } } ret = ext4_es_insert_delayed_block(inode, lblk, allocated); if (ret && reserved) ext4_da_release_space(inode, 1); errout: return ret; } /* * This function is grabs code from the very beginning of * ext4_map_blocks, but assumes that the caller is from delayed write * time. This function looks up the requested blocks and sets the * buffer delay bit under the protection of i_data_sem. */ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, struct ext4_map_blocks *map, struct buffer_head *bh) { struct extent_status es; int retval; sector_t invalid_block = ~((sector_t) 0xffff); #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) invalid_block = ~0; map->m_flags = 0; ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { if (ext4_es_is_hole(&es)) goto add_delayed; found: /* * Delayed extent could be allocated by fallocate. * So we need to check it. */ if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); return 0; } map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; retval = es.es_len - (iblock - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; if (ext4_es_is_written(&es)) map->m_flags |= EXT4_MAP_MAPPED; else if (ext4_es_is_unwritten(&es)) map->m_flags |= EXT4_MAP_UNWRITTEN; else BUG(); #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); #endif return retval; } /* * Try to see if we can get the block without requesting a new * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) retval = 0; else retval = ext4_map_query_blocks(NULL, inode, map); up_read(&EXT4_I(inode)->i_data_sem); if (retval) return retval; add_delayed: down_write(&EXT4_I(inode)->i_data_sem); /* * Page fault path (ext4_page_mkwrite does not take i_rwsem) * and fallocate path (no folio lock) can race. Make sure we * lookup the extent status tree here again while i_data_sem * is held in write mode, before inserting a new da entry in * the extent status tree. */ if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { if (!ext4_es_is_hole(&es)) { up_write(&EXT4_I(inode)->i_data_sem); goto found; } } else if (!ext4_has_inline_data(inode)) { retval = ext4_map_query_blocks(NULL, inode, map); if (retval) { up_write(&EXT4_I(inode)->i_data_sem); return retval; } } retval = ext4_insert_delayed_block(inode, map->m_lblk); up_write(&EXT4_I(inode)->i_data_sem); if (retval) return retval; map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); return retval; } /* * This is a special get_block_t callback which is used by * ext4_da_write_begin(). It will either return mapped block or * reserve space for a single block. * * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. * We also have b_blocknr = -1 and b_bdev initialized properly * * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev * initialized properly. */ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { struct ext4_map_blocks map; int ret = 0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); map.m_lblk = iblock; map.m_len = 1; /* * first, we need to know whether the block is allocated already * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ ret = ext4_da_map_blocks(inode, iblock, &map, bh); if (ret <= 0) return ret; map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); if (buffer_unwritten(bh)) { /* A delayed write to unwritten bh should be marked * new and mapped. Mapped ensures that we don't do * get_block multiple times when we write to the same * offset and new ensures that we do proper zero out * for partial write. */ set_buffer_new(bh); set_buffer_mapped(bh); } return 0; } static int __ext4_journalled_writepage(struct page *page, unsigned int len) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; handle_t *handle = NULL; int ret = 0, err = 0; int inline_data = ext4_has_inline_data(inode); struct buffer_head *inode_bh = NULL; loff_t size; ClearPageChecked(page); if (inline_data) { BUG_ON(page->index != 0); BUG_ON(len > ext4_get_max_inline_size(inode)); inode_bh = ext4_journalled_write_inline_data(inode, len, page); if (inode_bh == NULL) goto out; } /* * We need to release the page lock before we start the * journal, so grab a reference so the page won't disappear * out from under us. */ get_page(page); unlock_page(page); handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); put_page(page); goto out_no_pagelock; } BUG_ON(!ext4_handle_valid(handle)); lock_page(page); put_page(page); size = i_size_read(inode); if (page->mapping != mapping || page_offset(page) > size) { /* The page got truncated from under us */ ext4_journal_stop(handle); ret = 0; goto out; } if (inline_data) { ret = ext4_mark_inode_dirty(handle, inode); } else { struct buffer_head *page_bufs = page_buffers(page); if (page->index == size >> PAGE_SHIFT) len = size & ~PAGE_MASK; else len = PAGE_SIZE; ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, do_journal_get_write_access); err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, write_end_fn); } if (ret == 0) ret = err; err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; err = ext4_journal_stop(handle); if (!ret) ret = err; ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: unlock_page(page); out_no_pagelock: brelse(inode_bh); return ret; } /* * Note that we don't need to start a transaction unless we're journaling data * because we should have holes filled from ext4_page_mkwrite(). We even don't * need to file the inode to the transaction's list in ordered mode because if * we are writing back data added by write(), the inode is already there and if * we are writing back data modified via mmap(), no one guarantees in which * transaction the data will hit the disk. In case we are journaling data, we * cannot start transaction directly because transaction start ranks above page * lock so we have to do some magic. * * This function can get called via... * - ext4_writepages after taking page lock (have journal handle) * - journal_submit_inode_data_buffers (no journal handle) * - shrink_page_list via the kswapd/direct reclaim (no journal handle) * - grab_page_cache when doing write_begin (have journal handle) * * We don't do any block allocation in this function. If we have page with * multiple blocks we need to write those buffer_heads that are mapped. This * is important for mmaped based write. So if we do with blocksize 1K * truncate(f, 1024); * a = mmap(f, 0, 4096); * a[0] = 'a'; * truncate(f, 4096); * we have in the page first buffer_head mapped via page_mkwrite call back * but other buffer_heads would be unmapped but dirty (dirty done via the * do_wp_page). So writepage should write the first block. If we modify * the mmap area beyond 1024 we will again get a page_fault and the * page_mkwrite callback will do the block allocation and mark the * buffer_heads mapped. * * We redirty the page if we have any buffer_heads that is either delay or * unwritten in the page. * * We can get recursively called as show below. * * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> * ext4_writepage() * * But since we don't do any block allocation we should not deadlock. * Page also have the dirty flag cleared so we don't get recurive page_lock. */ static int ext4_writepage(struct page *page, struct writeback_control *wbc) { int ret = 0; loff_t size; unsigned int len; struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; struct ext4_io_submit io_submit; bool keep_towrite = false; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { inode->i_mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); unlock_page(page); return -EIO; } trace_ext4_writepage(page); size = i_size_read(inode); if (page->index == size >> PAGE_SHIFT && !ext4_verity_in_progress(inode)) len = size & ~PAGE_MASK; else len = PAGE_SIZE; /* Should never happen but for bugs in other kernel subsystems */ if (!page_has_buffers(page)) { ext4_warning_inode(inode, "page %lu does not have buffers attached", page->index); ClearPageDirty(page); unlock_page(page); return 0; } page_bufs = page_buffers(page); /* * We cannot do block allocation or other extent handling in this * function. If there are buffers needing that, we have to redirty * the page. But we may reach here when we do a journal commit via * journal_submit_inode_data_buffers() and in that case we must write * allocated buffers to achieve data=ordered mode guarantees. * * Also, if there is only one buffer per page (the fs block * size == the page size), if one buffer needs block * allocation or needs to modify the extent tree to clear the * unwritten flag, we know that the page can't be written at * all, so we might as well refuse the write immediately. * Unfortunately if the block size != page size, we can't as * easily detect this case using ext4_walk_page_buffers(), but * for the extremely common case, this is an optimization that * skips a useless round trip through ext4_bio_write_page(). */ if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, ext4_bh_delay_or_unwritten)) { redirty_page_for_writepage(wbc, page); if ((current->flags & PF_MEMALLOC) || (inode->i_sb->s_blocksize == PAGE_SIZE)) { /* * For memory cleaning there's no point in writing only * some buffers. So just bail out. Warn if we came here * from direct reclaim. */ WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC); unlock_page(page); return 0; } keep_towrite = true; } if (PageChecked(page) && ext4_should_journal_data(inode)) /* * It's mmapped pagecache. Add buffers and journal it. There * doesn't seem much point in redirtying the page here. */ return __ext4_journalled_writepage(page, len); ext4_io_submit_init(&io_submit, wbc); io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); if (!io_submit.io_end) { redirty_page_for_writepage(wbc, page); unlock_page(page); return -ENOMEM; } ret = ext4_bio_write_page(&io_submit, page, len, keep_towrite); ext4_io_submit(&io_submit); /* Drop io_end reference we got from init */ ext4_put_io_end_defer(io_submit.io_end); return ret; } static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) { int len; loff_t size; int err; BUG_ON(page->index != mpd->first_page); clear_page_dirty_for_io(page); /* * We have to be very careful here! Nothing protects writeback path * against i_size changes and the page can be writeably mapped into * page tables. So an application can be growing i_size and writing * data through mmap while writeback runs. clear_page_dirty_for_io() * write-protects our page in page tables and the page cannot get * written to again until we release page lock. So only after * clear_page_dirty_for_io() we are safe to sample i_size for * ext4_bio_write_page() to zero-out tail of the written page. We rely * on the barrier provided by TestClearPageDirty in * clear_page_dirty_for_io() to make sure i_size is really sampled only * after page tables are updated. */ size = i_size_read(mpd->inode); if (page->index == size >> PAGE_SHIFT && !ext4_verity_in_progress(mpd->inode)) len = size & ~PAGE_MASK; else len = PAGE_SIZE; err = ext4_bio_write_page(&mpd->io_submit, page, len, false); if (!err) mpd->wbc->nr_to_write--; mpd->first_page++; return err; } #define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay)) /* * mballoc gives us at most this number of blocks... * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). * The rest of mballoc seems to handle chunks up to full group size. */ #define MAX_WRITEPAGES_EXTENT_LEN 2048 /* * mpage_add_bh_to_extent - try to add bh to extent of blocks to map * * @mpd - extent of blocks * @lblk - logical number of the block in the file * @bh - buffer head we want to add to the extent * * The function is used to collect contig. blocks in the same state. If the * buffer doesn't require mapping for writeback and we haven't started the * extent of buffers to map yet, the function returns 'true' immediately - the * caller can write the buffer right away. Otherwise the function returns true * if the block has been added to the extent, false if the block couldn't be * added. */ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, struct buffer_head *bh) { struct ext4_map_blocks *map = &mpd->map; /* Buffer that doesn't need mapping for writeback? */ if (!buffer_dirty(bh) || !buffer_mapped(bh) || (!buffer_delay(bh) && !buffer_unwritten(bh))) { /* So far no extent to map => we write the buffer right away */ if (map->m_len == 0) return true; return false; } /* First block in the extent? */ if (map->m_len == 0) { /* We cannot map unless handle is started... */ if (!mpd->do_map) return false; map->m_lblk = lblk; map->m_len = 1; map->m_flags = bh->b_state & BH_FLAGS; return true; } /* Don't go larger than mballoc is willing to allocate */ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) return false; /* Can we merge the block to our big extent? */ if (lblk == map->m_lblk + map->m_len && (bh->b_state & BH_FLAGS) == map->m_flags) { map->m_len++; return true; } return false; } /* * mpage_process_page_bufs - submit page buffers for IO or add them to extent * * @mpd - extent of blocks for mapping * @head - the first buffer in the page * @bh - buffer we should start processing from * @lblk - logical number of the block in the file corresponding to @bh * * Walk through page buffers from @bh upto @head (exclusive) and either submit * the page for IO if all buffers in this page were mapped and there's no * accumulated extent of buffers to map or add buffers in the page to the * extent of buffers to map. The function returns 1 if the caller can continue * by processing the next page, 0 if it should stop adding buffers to the * extent to map because we cannot extend it anymore. It can also return value * < 0 in case of error during IO submission. */ static int mpage_process_page_bufs(struct mpage_da_data *mpd, struct buffer_head *head, struct buffer_head *bh, ext4_lblk_t lblk) { struct inode *inode = mpd->inode; int err; ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) >> inode->i_blkbits; if (ext4_verity_in_progress(inode)) blocks = EXT_MAX_BLOCKS; do { BUG_ON(buffer_locked(bh)); if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { /* Found extent to map? */ if (mpd->map.m_len) return 0; /* Buffer needs mapping and handle is not started? */ if (!mpd->do_map) return 0; /* Everything mapped so far and we hit EOF */ break; } } while (lblk++, (bh = bh->b_this_page) != head); /* So far everything mapped? Submit the page for IO. */ if (mpd->map.m_len == 0) { err = mpage_submit_page(mpd, head->b_page); if (err < 0) return err; } if (lblk >= blocks) { mpd->scanned_until_end = 1; return 0; } return 1; } /* * mpage_process_page - update page buffers corresponding to changed extent and * may submit fully mapped page for IO * * @mpd - description of extent to map, on return next extent to map * @m_lblk - logical block mapping. * @m_pblk - corresponding physical mapping. * @map_bh - determines on return whether this page requires any further * mapping or not. * Scan given page buffers corresponding to changed extent and update buffer * state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits. * If the given page is not fully mapped, we update @map to the next extent in * the given page that needs mapping & return @map_bh as true. */ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, bool *map_bh) { struct buffer_head *head, *bh; ext4_io_end_t *io_end = mpd->io_submit.io_end; ext4_lblk_t lblk = *m_lblk; ext4_fsblk_t pblock = *m_pblk; int err = 0; int blkbits = mpd->inode->i_blkbits; ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); bh = head = page_buffers(page); do { if (lblk < mpd->map.m_lblk) continue; if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { /* * Buffer after end of mapped extent. * Find next buffer in the page to map. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; io_end_vec->size += io_end_size; io_end_size = 0; err = mpage_process_page_bufs(mpd, head, bh, lblk); if (err > 0) err = 0; if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) { io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) { err = PTR_ERR(io_end_vec); goto out; } io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits; } *map_bh = true; goto out; } if (buffer_delay(bh)) { clear_buffer_delay(bh); bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); io_end_size += (1 << blkbits); } while (lblk++, (bh = bh->b_this_page) != head); io_end_vec->size += io_end_size; io_end_size = 0; *map_bh = false; out: *m_lblk = lblk; *m_pblk = pblock; return err; } /* * mpage_map_buffers - update buffers corresponding to changed extent and * submit fully mapped pages for IO * * @mpd - description of extent to map, on return next extent to map * * Scan buffers corresponding to changed extent (we expect corresponding pages * to be already locked) and update buffer state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits, * and mark buffers as uninit when we perform writes to unwritten extents * and do extent conversion after IO is finished. If the last page is not fully * mapped, we update @map to the next extent in the last page that needs * mapping. Otherwise we submit the page for IO. */ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) { struct pagevec pvec; int nr_pages, i; struct inode *inode = mpd->inode; int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; ext4_fsblk_t pblock; int err; bool map_bh = false; start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; lblk = start << bpp_bits; pblock = mpd->map.m_pblk; pagevec_init(&pvec); while (start <= end) { nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &start, end); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; err = mpage_process_page(mpd, page, &lblk, &pblock, &map_bh); /* * If map_bh is true, means page may require further bh * mapping, or maybe the page was submitted for IO. * So we return to call further extent mapping. */ if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run! */ err = mpage_submit_page(mpd, page); if (err < 0) goto out; } pagevec_release(&pvec); } /* Extent fully mapped and matches with page boundary. We are done. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; return 0; out: pagevec_release(&pvec); return err; } static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int get_blocks_flags; int err, dioread_nolock; trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or * to convert an unwritten extent to be initialized (in the case * where we have written into one or more preallocated blocks). It is * possible that we're going to need more metadata blocks than * previously reserved. However we must not fail because we're in * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if * possible. * * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if * the blocks in question are delalloc blocks. This indicates * that the blocks and quotas has already been checked when * the data was copied into the page cache. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_GET_BLOCKS_IO_SUBMIT; dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; if (map->m_flags & BIT(BH_Delay)) get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) return err; if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { if (!mpd->io_submit.io_end->handle && ext4_handle_valid(handle)) { mpd->io_submit.io_end->handle = handle->h_rsv_handle; handle->h_rsv_handle = NULL; } ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); } BUG_ON(map->m_len == 0); return 0; } /* * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length * mpd->len and submit pages underlying it for IO * * @handle - handle for journal operations * @mpd - extent to map * @give_up_on_write - we set this to true iff there is a fatal error and there * is no hope of writing the data. The caller should discard * dirty pages to avoid infinite loops. * * The function maps extent starting at mpd->lblk of length mpd->len. If it is * delayed, blocks are allocated, if it is unwritten, we may need to convert * them to initialized or split the described range from larger unwritten * extent. Note that we need not map all the described range since allocation * can return less blocks or the range is covered by more unwritten extents. We * cannot map more because we are limited by reserved transaction credits. On * the other hand we always make sure that the last touched page is fully * mapped so that it can be written out (and thus forward progress is * guaranteed). After mapping we submit all mapped pages for IO. */ static int mpage_map_and_submit_extent(handle_t *handle, struct mpage_da_data *mpd, bool *give_up_on_write) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int err; loff_t disksize; int progress = 0; ext4_io_end_t *io_end = mpd->io_submit.io_end; struct ext4_io_end_vec *io_end_vec; io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) return PTR_ERR(io_end_vec); io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { struct super_block *sb = inode->i_sb; if (ext4_forced_shutdown(EXT4_SB(sb)) || ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. * In the case of ENOSPC, if ext4_count_free_blocks() * is non-zero, a commit should free up blocks. */ if ((err == -ENOMEM) || (err == -ENOSPC && ext4_count_free_clusters(sb))) { if (progress) goto update_disksize; return err; } ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " "inode %lu at logical offset %llu with" " max blocks %u with error %d", inode->i_ino, (unsigned long long)map->m_lblk, (unsigned)map->m_len, -err); ext4_msg(sb, KERN_CRIT, "This should not happen!! Data will " "be lost\n"); if (err == -ENOSPC) ext4_print_free_blocks(inode); invalidate_dirty_pages: *give_up_on_write = true; return err; } progress = 1; /* * Update buffer state, submit mapped pages, and get us new * extent to map */ err = mpage_map_and_submit_buffers(mpd); if (err < 0) goto update_disksize; } while (map->m_len); update_disksize: /* * Update on-disk size after IO is submitted. Races with * truncate are avoided by checking i_size under i_data_sem. */ disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { int err2; loff_t i_size; down_write(&EXT4_I(inode)->i_data_sem); i_size = i_size_read(inode); if (disksize > i_size) disksize = i_size; if (disksize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = disksize; up_write(&EXT4_I(inode)->i_data_sem); err2 = ext4_mark_inode_dirty(handle, inode); if (err2) { ext4_error_err(inode->i_sb, -err2, "Failed to mark inode %lu dirty", inode->i_ino); } if (!err) err = err2; } return err; } /* * Calculate the total number of credits to reserve for one writepages * iteration. This is called from ext4_writepages(). We map an extent of * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + * bpp - 1 blocks in bpp different extents. */ static int ext4_da_writepages_trans_blocks(struct inode *inode) { int bpp = ext4_journal_blocks_per_page(inode); return ext4_meta_trans_blocks(inode, MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } /* * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages * and underlying extent to map * * @mpd - where to look for pages * * Walk dirty pages in the mapping. If they are fully mapped, submit them for * IO immediately. When we find a page which isn't mapped we start accumulating * extent of buffers underlying these pages that needs mapping (formed by * either delayed or unwritten buffers). We also lock the pages containing * these buffers. The extent found is returned in @mpd structure (starting at * mpd->lblk with length mpd->len blocks). * * Note that this function can attach bios to one io_end structure which are * neither logically nor physically contiguous. Although it may seem as an * unnecessary complication, it is actually inevitable in blocksize < pagesize * case as we need to track IO to all buffers underlying a page in one io_end. */ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; struct pagevec pvec; unsigned int nr_pages; long left = mpd->wbc->nr_to_write; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; xa_mark_t tag; int i, err = 0; int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; pagevec_init(&pvec); mpd->map.m_len = 0; mpd->next_page = index; while (index <= end) { nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, tag); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; /* * Accumulated enough dirty pages? This doesn't apply * to WB_SYNC_ALL mode. For integrity sync we have to * keep going because someone may be concurrently * dirtying pages, and we might have synced a lot of * newly appeared dirty pages, but have not synced all * of the old dirty pages. */ if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) goto out; /* If we can't merge this page, we are done. */ if (mpd->map.m_len > 0 && mpd->next_page != page->index) goto out; lock_page(page); /* * If the page is no longer dirty, or its mapping no * longer corresponds to inode we are writing (which * means it has been truncated or invalidated), or the * page is already under writeback and we are not doing * a data integrity writeback, skip the page */ if (!PageDirty(page) || (PageWriteback(page) && (mpd->wbc->sync_mode == WB_SYNC_NONE)) || unlikely(page->mapping != mapping)) { unlock_page(page); continue; } wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); /* * Should never happen but for buggy code in * other subsystems that call * set_page_dirty() without properly warning * the file system first. See [1] for more * information. * * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz */ if (!page_has_buffers(page)) { ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index); ClearPageDirty(page); unlock_page(page); continue; } if (mpd->map.m_len == 0) mpd->first_page = page->index; mpd->next_page = page->index + 1; /* Add all dirty buffers to mpd */ lblk = ((ext4_lblk_t)page->index) << (PAGE_SHIFT - blkbits); head = page_buffers(page); err = mpage_process_page_bufs(mpd, head, head, lblk); if (err <= 0) goto out; err = 0; left--; } pagevec_release(&pvec); cond_resched(); } mpd->scanned_until_end = 1; return 0; out: pagevec_release(&pvec); return err; } static int ext4_writepages(struct address_space *mapping, struct writeback_control *wbc) { pgoff_t writeback_index = 0; long nr_to_write = wbc->nr_to_write; int range_whole = 0; int cycled = 1; handle_t *handle = NULL; struct mpage_da_data mpd; struct inode *inode = mapping->host; int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); struct blk_plug plug; bool give_up_on_write = false; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; percpu_down_read(&sbi->s_writepages_rwsem); trace_ext4_writepages(inode, wbc); /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() * because that could violate lock ordering on umount */ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) goto out_writepages; if (ext4_should_journal_data(inode)) { ret = generic_writepages(mapping, wbc); goto out_writepages; } /* * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) || ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) { ret = -EROFS; goto out_writepages; } /* * If we have inline data and arrive here, it means that * we will soon create the block for the 1st page, so * we'd better clear the inline data here. */ if (ext4_has_inline_data(inode)) { /* Just inode will be modified... */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_writepages; } BUG_ON(ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)); ext4_destroy_inline_data(handle, inode); ext4_journal_stop(handle); } if (ext4_should_dioread_nolock(inode)) { /* * We may need to convert up to one extent per block in * the page and we may dirty the inode. */ rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, PAGE_SIZE >> inode->i_blkbits); } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; mpd.first_page = writeback_index; mpd.last_page = -1; } else { mpd.first_page = wbc->range_start >> PAGE_SHIFT; mpd.last_page = wbc->range_end >> PAGE_SHIFT; } mpd.inode = inode; mpd.wbc = wbc; ext4_io_submit_init(&mpd.io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); blk_start_plug(&plug); /* * First writeback pages that don't need mapping - we can avoid * starting a transaction unnecessarily and also avoid being blocked * in the block layer on device congestion while having transaction * started. */ mpd.do_map = 0; mpd.scanned_until_end = 0; mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd.io_submit.io_end) { ret = -ENOMEM; goto unplug; } ret = mpage_prepare_extent_to_map(&mpd); /* Unlock pages we didn't use */ mpage_release_unused_pages(&mpd, false); /* Submit prepared bio */ ext4_io_submit(&mpd.io_submit); ext4_put_io_end_defer(mpd.io_submit.io_end); mpd.io_submit.io_end = NULL; if (ret < 0) goto unplug; while (!mpd.scanned_until_end && wbc->nr_to_write > 0) { /* For each extent of pages we use new io_end */ mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd.io_submit.io_end) { ret = -ENOMEM; break; } /* * We have two constraints: We find one extent to map and we * must always write out whole page (makes a difference when * blocksize < pagesize) so that we don't block on IO when we * try to write out the rest of the page. Journalled mode is * not supported by delalloc. */ BUG_ON(ext4_should_journal_data(inode)); needed_blocks = ext4_da_writepages_trans_blocks(inode); /* start a new transaction */ handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ ext4_put_io_end(mpd.io_submit.io_end); mpd.io_submit.io_end = NULL; break; } mpd.do_map = 1; trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); ret = mpage_prepare_extent_to_map(&mpd); if (!ret && mpd.map.m_len) ret = mpage_map_and_submit_extent(handle, &mpd, &give_up_on_write); /* * Caution: If the handle is synchronous, * ext4_journal_stop() can wait for transaction commit * to finish which may depend on writeback of pages to * complete or on page lock to be released. In that * case, we have to wait until after we have * submitted all the IO, released page locks we hold, * and dropped io_end reference (for extent conversion * to be able to complete) before stopping the handle. */ if (!ext4_handle_valid(handle) || handle->h_sync == 0) { ext4_journal_stop(handle); handle = NULL; mpd.do_map = 0; } /* Unlock pages we didn't use */ mpage_release_unused_pages(&mpd, give_up_on_write); /* Submit prepared bio */ ext4_io_submit(&mpd.io_submit); /* * Drop our io_end reference we got from init. We have * to be careful and use deferred io_end finishing if * we are still holding the transaction as we can * release the last reference to io_end which may end * up doing unwritten extent conversion. */ if (handle) { ext4_put_io_end_defer(mpd.io_submit.io_end); ext4_journal_stop(handle); } else ext4_put_io_end(mpd.io_submit.io_end); mpd.io_submit.io_end = NULL; if (ret == -ENOSPC && sbi->s_journal) { /* * Commit the transaction which would * free blocks released in the transaction * and try again */ jbd2_journal_force_commit_nested(sbi->s_journal); ret = 0; continue; } /* Fatal error - ENOMEM, EIO... */ if (ret) break; } unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; mpd.last_page = writeback_index - 1; mpd.first_page = 0; goto retry; } /* Update index */ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* * Set the writeback_index so that range_cyclic * mode will write it back later */ mapping->writeback_index = mpd.first_page; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); percpu_up_read(&sbi->s_writepages_rwsem); return ret; } static int ext4_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; percpu_down_read(&sbi->s_writepages_rwsem); trace_ext4_writepages(inode, wbc); ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); percpu_up_read(&sbi->s_writepages_rwsem); return ret; } static int ext4_nonda_switch(struct super_block *sb) { s64 free_clusters, dirty_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * switch to non delalloc mode if we are running low * on free block. The free block accounting via percpu * counters can get slightly wrong with percpu_counter_batch getting * accumulated on each CPU without updating global counters * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. */ free_clusters = percpu_counter_read_positive(&sbi->s_freeclusters_counter); dirty_clusters = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); /* * Start pushing delalloc when 1/2 of free blocks are dirty. */ if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); if (2 * free_clusters < 3 * dirty_clusters || free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark */ return 1; } return 0; } static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { int ret, retries = 0; struct page *page; pgoff_t index; struct inode *inode = mapping->host; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; index = pos >> PAGE_SHIFT; if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, flags, pagep, fsdata); } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len, flags); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len, flags, pagep, fsdata); if (ret < 0) return ret; if (ret == 1) return 0; } retry: page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); #ifdef CONFIG_FS_ENCRYPTION ret = ext4_block_write_begin(page, pos, len, ext4_da_get_block_prep); #else ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); #endif if (ret < 0) { unlock_page(page); put_page(page); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold inode lock. */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret; } *pagep = page; return ret; } /* * Check if we should update i_disksize * when write to the end of file but not require block allocation */ static int ext4_da_should_update_i_disksize(struct page *page, unsigned long offset) { struct buffer_head *bh; struct inode *inode = page->mapping->host; unsigned int idx; int i; bh = page_buffers(page); idx = offset >> inode->i_blkbits; for (i = 0; i < idx; i++) bh = bh->b_this_page; if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) return 0; return 1; } static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct inode *inode = mapping->host; loff_t new_i_size; unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(file, mapping, pos, len, copied, page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); if (write_mode != CONVERT_INLINE_DATA && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, page); if (unlikely(copied < len) && !PageUptodate(page)) copied = 0; start = pos & (PAGE_SIZE - 1); end = start + copied - 1; /* * Since we are holding inode lock, we are sure i_disksize <= * i_size. We also know that if i_disksize < i_size, there are * delalloc writes pending in the range upto i_size. If the end of * the current write is <= i_size, there's no need to touch * i_disksize since writeback will push i_disksize upto i_size * eventually. If the end of the current write is > i_size and * inside an allocated block (ext4_da_should_update_i_disksize() * check), we need to update i_disksize here as neither * ext4_writepage() nor certain ext4_writepages() paths not * allocating blocks update i_disksize. * * Note that we defer inode dirtying to generic_write_end() / * ext4_da_write_inline_data_end(). */ new_i_size = pos + copied; if (copied && new_i_size > inode->i_size && ext4_da_should_update_i_disksize(page, end)) ext4_update_i_disksize(inode, new_i_size); return generic_write_end(file, mapping, pos, len, copied, page, fsdata); } /* * Force all delayed allocation blocks to be allocated for a given inode. */ int ext4_alloc_da_blocks(struct inode *inode) { trace_ext4_alloc_da_blocks(inode); if (!EXT4_I(inode)->i_reserved_data_blocks) return 0; /* * We do something simple for now. The filemap_flush() will * also start triggering a write of the data blocks, which is * not strictly speaking necessary (and for users of * laptop_mode, not even desirable). However, to do otherwise * would require replicating code paths in: * * ext4_writepages() -> * write_cache_pages() ---> (via passed in callback function) * __mpage_da_writepage() --> * mpage_add_bh_to_extent() * mpage_da_map_blocks() * * The problem is that write_cache_pages(), located in * mm/page-writeback.c, marks pages clean in preparation for * doing I/O, which is not desirable if we're not planning on * doing I/O at all. * * We could call write_cache_pages(), and then redirty all of * the pages by calling redirty_page_for_writepage() but that * would be ugly in the extreme. So instead we would need to * replicate parts of the code in the above functions, * simplifying them because we wouldn't actually intend to * write out the pages, but rather only collect contiguous * logical block extents, call the multi-block allocator, and * then update the buffer heads with the block allocations. * * For now, though, we'll cheat by calling filemap_flush(), * which will map the blocks, and start the I/O, but not * actually wait for the I/O to complete. */ return filemap_flush(inode->i_mapping); } /* * bmap() is special. It gets used by applications such as lilo and by * the swapper to find the on-disk block of a specific piece of data. * * Naturally, this is dangerous if the block concerned is still in the * journal. If somebody makes a swapfile on an ext4 data-journaling * filesystem and enables swap, then they may get a nasty shock when the * data getting swapped to that swapfile suddenly gets overwritten by * the original zero's written out previously to the journal and * awaiting writeback in the kernel's buffer cache. * * So, if we see any bmap calls here on a modified, data-journaled file, * take extra steps to flush any blocks which might be in the cache. */ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; journal_t *journal; sector_t ret = 0; int err; inode_lock_shared(inode); /* * We can get here for an inline file via the FIBMAP ioctl */ if (ext4_has_inline_data(inode)) goto out; if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && test_opt(inode->i_sb, DELALLOC)) { /* * With delalloc we want to sync the file * so that we can make sure we allocate * blocks for file */ filemap_write_and_wait(mapping); } if (EXT4_JOURNAL(inode) && ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { /* * This is a REALLY heavyweight approach, but the use of * bmap on dirty files is expected to be extremely rare: * only if we run lilo or swapon on a freshly made file * do we expect this to happen. * * (bmap requires CAP_SYS_RAWIO so this does not * represent an unprivileged user DOS attack --- we'd be * in trouble if mortal users could trigger this path at * will.) * * NB. EXT4_STATE_JDATA is not set on files other than * regular files. If somebody wants to bmap a directory * or symlink and gets confused because the buffer * hasn't yet been flushed to disk, they deserve * everything they get. */ ext4_clear_inode_state(inode, EXT4_STATE_JDATA); journal = EXT4_JOURNAL(inode); jbd2_journal_lock_updates(journal); err = jbd2_journal_flush(journal, 0); jbd2_journal_unlock_updates(journal); if (err) goto out; } ret = iomap_bmap(mapping, block, &ext4_iomap_ops); out: inode_unlock_shared(inode); return ret; } static int ext4_readpage(struct file *file, struct page *page) { int ret = -EAGAIN; struct inode *inode = page->mapping->host; trace_ext4_readpage(page); if (ext4_has_inline_data(inode)) ret = ext4_readpage_inline(inode, page); if (ret == -EAGAIN) return ext4_mpage_readpages(inode, NULL, page); return ret; } static void ext4_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; /* If the file has inline data, no need to do readahead. */ if (ext4_has_inline_data(inode)) return; ext4_mpage_readpages(inode, rac, NULL); } static void ext4_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { trace_ext4_invalidatepage(page, offset, length); /* No journalling happens on data buffers when this function is used */ WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); block_invalidatepage(page, offset, length); } static int __ext4_journalled_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { journal_t *journal = EXT4_JOURNAL(page->mapping->host); trace_ext4_journalled_invalidatepage(page, offset, length); /* * If it's a full truncate we just forget about the pending dirtying */ if (offset == 0 && length == PAGE_SIZE) ClearPageChecked(page); return jbd2_journal_invalidatepage(journal, page, offset, length); } /* Wrapper for aops... */ static void ext4_journalled_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0); } static int ext4_releasepage(struct page *page, gfp_t wait) { journal_t *journal = EXT4_JOURNAL(page->mapping->host); trace_ext4_releasepage(page); /* Page has dirty journalled data -> cannot release */ if (PageChecked(page)) return 0; if (journal) return jbd2_journal_try_to_free_buffers(journal, page); else return try_to_free_buffers(page); } static bool ext4_inode_datasync_dirty(struct inode *inode) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; if (journal) { if (jbd2_transaction_committed(journal, EXT4_I(inode)->i_datasync_tid)) return false; if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) return !list_empty(&EXT4_I(inode)->i_fc_list); return true; } /* Any metadata buffers to write? */ if (!list_empty(&inode->i_mapping->private_list)) return true; return inode->i_state & I_DIRTY_DATASYNC; } static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, struct ext4_map_blocks *map, loff_t offset, loff_t length) { u8 blkbits = inode->i_blkbits; /* * Writes that span EOF might trigger an I/O size update on completion, * so consider them to be dirty for the purpose of O_DSYNC, even if * there is no other metadata changes being made or are pending. */ iomap->flags = 0; if (ext4_inode_datasync_dirty(inode) || offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; if (map->m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; iomap->bdev = inode->i_sb->s_bdev; iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; iomap->offset = (u64) map->m_lblk << blkbits; iomap->length = (u64) map->m_len << blkbits; if ((map->m_flags & EXT4_MAP_MAPPED) && !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) iomap->flags |= IOMAP_F_MERGED; /* * Flags passed to ext4_map_blocks() for direct I/O writes can result * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits * set. In order for any allocated unwritten extents to be converted * into written extents correctly within the ->end_io() handler, we * need to ensure that the iomap->type is set appropriately. Hence, the * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has * been set first. */ if (map->m_flags & EXT4_MAP_UNWRITTEN) { iomap->type = IOMAP_UNWRITTEN; iomap->addr = (u64) map->m_pblk << blkbits; } else if (map->m_flags & EXT4_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; iomap->addr = (u64) map->m_pblk << blkbits; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; } } static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; u8 blkbits = inode->i_blkbits; int ret, dio_credits, m_flags = 0, retries = 0; /* * Trim the mapping request to the maximum value that we can map at * once for direct I/O. */ if (map->m_len > DIO_MAX_BLOCKS) map->m_len = DIO_MAX_BLOCKS; dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); retry: /* * Either we allocate blocks and then don't get an unwritten extent, so * in that case we have reserved enough credits. Or, the blocks are * already allocated and unwritten. In that case, the extent conversion * fits into the credits as well. */ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); if (IS_ERR(handle)) return PTR_ERR(handle); /* * DAX and direct I/O are the only two operations that are currently * supported with IOMAP_WRITE. */ WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT)); if (IS_DAX(inode)) m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; /* * We use i_size instead of i_disksize here because delalloc writeback * can complete at any point during the I/O and subsequently push the * i_disksize out to i_size. This could be beyond where direct I/O is * happening and thus expose allocated blocks to direct I/O reads. */ else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode)) m_flags = EXT4_GET_BLOCKS_CREATE; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; ret = ext4_map_blocks(handle, inode, map, m_flags); /* * We cannot fill holes in indirect tree based inodes as that could * expose stale data in the case of a crash. Use the magic error code * to fallback to buffered I/O. */ if (!m_flags && !ret) ret = -ENOTBLK; ext4_journal_stop(handle); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret; } static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (WARN_ON_ONCE(ext4_has_inline_data(inode))) return -ERANGE; /* * Calculate the first and last logical blocks respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; if (flags & IOMAP_WRITE) { /* * We check here if the blocks are already allocated, then we * don't need to start a journal txn and we can directly return * the mapping information. This could boost performance * especially in multi-threaded overwrite requests. */ if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) goto out; } ret = ext4_iomap_alloc(inode, &map, flags); } else { ret = ext4_map_blocks(NULL, inode, &map, 0); } if (ret < 0) return ret; out: ext4_set_iomap(inode, iomap, &map, offset, length); return 0; } static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int ret; /* * Even for writes we don't need to allocate blocks, so just pretend * we are reading to save overhead of starting a transaction. */ flags &= ~IOMAP_WRITE; ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED); return ret; } static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { /* * Check to see whether an error occurred while writing out the data to * the allocated blocks. If so, return the magic error code so that we * fallback to buffered I/O and attempt to complete the remainder of * the I/O. Any blocks that may have been allocated in preparation for * the direct I/O will be reused during buffered I/O. */ if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) return -ENOTBLK; return 0; } const struct iomap_ops ext4_iomap_ops = { .iomap_begin = ext4_iomap_begin, .iomap_end = ext4_iomap_end, }; const struct iomap_ops ext4_iomap_overwrite_ops = { .iomap_begin = ext4_iomap_overwrite_begin, .iomap_end = ext4_iomap_end, }; static bool ext4_iomap_is_delalloc(struct inode *inode, struct ext4_map_blocks *map) { struct extent_status es; ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1; ext4_es_find_extent_range(inode, &ext4_es_is_delayed, map->m_lblk, end, &es); if (!es.es_len || es.es_lblk > end) return false; if (es.es_lblk > map->m_lblk) { map->m_len = es.es_lblk - map->m_lblk; return false; } offset = map->m_lblk - es.es_lblk; map->m_len = es.es_len - offset; return true; } static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; bool delalloc = false; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (ext4_has_inline_data(inode)) { ret = ext4_inline_data_iomap(inode, iomap); if (ret != -EAGAIN) { if (ret == 0 && offset >= iomap->length) ret = -ENOENT; return ret; } } /* * Calculate the first and last logical block respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; /* * Fiemap callers may call for offset beyond s_bitmap_maxbytes. * So handle it here itself instead of querying ext4_map_blocks(). * Since ext4_map_blocks() will warn about it and will return * -EIO error. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (offset >= sbi->s_bitmap_maxbytes) { map.m_flags = 0; goto set_iomap; } } ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; if (ret == 0) delalloc = ext4_iomap_is_delalloc(inode, &map); set_iomap: ext4_set_iomap(inode, iomap, &map, offset, length); if (delalloc && iomap->type == IOMAP_HOLE) iomap->type = IOMAP_DELALLOC; return 0; } const struct iomap_ops ext4_iomap_report_ops = { .iomap_begin = ext4_iomap_begin_report, }; /* * Pages can be marked dirty completely asynchronously from ext4's journalling * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do * much here because ->set_page_dirty is called under VFS locks. The page is * not necessarily locked. * * We cannot just dirty the page and leave attached buffers clean, because the * buffers' dirty state is "definitive". We cannot just set the buffers dirty * or jbddirty because all the journalling code will explode. * * So what we do is to mark the page "pending dirty" and next time writepage * is called, propagate that into the buffers appropriately. */ static int ext4_journalled_set_page_dirty(struct page *page) { SetPageChecked(page); return __set_page_dirty_nobuffers(page); } static int ext4_set_page_dirty(struct page *page) { WARN_ON_ONCE(!PageLocked(page) && !PageDirty(page)); WARN_ON_ONCE(!page_has_buffers(page)); return __set_page_dirty_buffers(page); } static int ext4_iomap_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { return iomap_swapfile_activate(sis, file, span, &ext4_iomap_report_ops); } static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, .set_page_dirty = ext4_set_page_dirty, .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_journalled_aops = { .readpage = ext4_readpage, .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, .set_page_dirty = ext4_journalled_set_page_dirty, .bmap = ext4_bmap, .invalidatepage = ext4_journalled_invalidatepage, .releasepage = ext4_releasepage, .direct_IO = noop_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_da_aops = { .readpage = ext4_readpage, .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, .set_page_dirty = ext4_set_page_dirty, .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .direct_IO = noop_direct_IO, .set_page_dirty = __set_page_dirty_no_writeback, .bmap = ext4_bmap, .invalidatepage = noop_invalidatepage, .swap_activate = ext4_iomap_swap_activate, }; void ext4_set_aops(struct inode *inode) { switch (ext4_inode_journal_mode(inode)) { case EXT4_INODE_ORDERED_DATA_MODE: case EXT4_INODE_WRITEBACK_DATA_MODE: break; case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; return; default: BUG(); } if (IS_DAX(inode)) inode->i_mapping->a_ops = &ext4_dax_aops; else if (test_opt(inode->i_sb, DELALLOC)) inode->i_mapping->a_ops = &ext4_da_aops; else inode->i_mapping->a_ops = &ext4_aops; } static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_SHIFT; unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; struct page *page; int err = 0; page = find_or_create_page(mapping, from >> PAGE_SHIFT, mapping_gfp_constraint(mapping, ~__GFP_FS)); if (!page) return -ENOMEM; blocksize = inode->i_sb->s_blocksize; iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); /* Find the buffer that contains "offset" */ bh = page_buffers(page); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; iblock++; pos += blocksize; } if (buffer_freed(bh)) { BUFFER_TRACE(bh, "freed: skip"); goto unlock; } if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "unmapped"); ext4_get_block(inode, iblock, bh, 0); /* unmapped? It's a hole - nothing to do */ if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "still unmapped"); goto unlock; } } /* Ok, it's mapped. Make sure it's up-to-date */ if (PageUptodate(page)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { err = ext4_read_bh_lock(bh, 0, true); if (err) goto unlock; if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); err = fscrypt_decrypt_pagecache_blocks(page, blocksize, bh_offset(bh)); if (err) { clear_buffer_uptodate(bh); goto unlock; } } } if (ext4_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto unlock; } zero_user(page, offset, length); BUFFER_TRACE(bh, "zeroed end of block"); if (ext4_should_journal_data(inode)) { err = ext4_handle_dirty_metadata(handle, inode, bh); } else { err = 0; mark_buffer_dirty(bh); if (ext4_should_order_data(inode)) err = ext4_jbd2_inode_add_write(handle, inode, from, length); } unlock: unlock_page(page); put_page(page); return err; } /* * ext4_block_zero_page_range() zeros out a mapping of length 'length' * starting from file offset 'from'. The range to be zero'd must * be contained with in one block. If the specified range exceeds * the end of the block it will be shortened to end of the block * that corresponds to 'from' */ static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { struct inode *inode = mapping->host; unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize = inode->i_sb->s_blocksize; unsigned max = blocksize - (offset & (blocksize - 1)); /* * correct length if it does not fall between * 'from' and the end of the block */ if (length > max || length < 0) length = max; if (IS_DAX(inode)) { return iomap_zero_range(inode, from, length, NULL, &ext4_iomap_ops); } return __ext4_block_zero_page_range(handle, mapping, from, length); } /* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ static int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { unsigned offset = from & (PAGE_SIZE-1); unsigned length; unsigned blocksize; struct inode *inode = mapping->host; /* If we are processing an encrypted inode during orphan list handling */ if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) return 0; blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); return ext4_block_zero_page_range(handle, mapping, from, length); } int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t length) { struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; unsigned partial_start, partial_end; ext4_fsblk_t start, end; loff_t byte_end = (lstart + length - 1); int err = 0; partial_start = lstart & (sb->s_blocksize - 1); partial_end = byte_end & (sb->s_blocksize - 1); start = lstart >> sb->s_blocksize_bits; end = byte_end >> sb->s_blocksize_bits; /* Handle partial zero within the single block */ if (start == end && (partial_start || (partial_end != sb->s_blocksize - 1))) { err = ext4_block_zero_page_range(handle, mapping, lstart, length); return err; } /* Handle partial zero out on the start of the range */ if (partial_start) { err = ext4_block_zero_page_range(handle, mapping, lstart, sb->s_blocksize); if (err) return err; } /* Handle partial zero out on the end of the range */ if (partial_end != sb->s_blocksize - 1) err = ext4_block_zero_page_range(handle, mapping, byte_end - partial_end, partial_end + 1); return err; } int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) return 1; if (S_ISLNK(inode->i_mode)) return !ext4_inode_is_fast_symlink(inode); return 0; } /* * We have to make sure i_disksize gets properly updated before we truncate * page cache due to hole punching or zero range. Otherwise i_disksize update * can get lost as it may have been postponed to submission of writeback but * that will never happen after we truncate page cache. */ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len) { handle_t *handle; int ret; loff_t size = i_size_read(inode); WARN_ON(!inode_is_locked(inode)); if (offset > size || offset + len < size) return 0; if (EXT4_I(inode)->i_disksize >= size) return 0; handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_update_i_disksize(inode, size); ret = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); return ret; } static void ext4_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); schedule(); filemap_invalidate_lock(inode->i_mapping); } int ext4_break_layouts(struct inode *inode) { struct page *page; int error; if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) return -EINVAL; do { page = dax_layout_busy_page(inode->i_mapping); if (!page) return 0; error = ___wait_var_event(&page->_refcount, atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, 0, 0, ext4_wait_dax_page(inode)); } while (error == 0); return error; } /* * ext4_punch_hole: punches a hole in a file by releasing the blocks * associated with the given offset and length * * @inode: File inode * @offset: The offset where the hole will begin * @len: The length of the hole * * Returns: 0 on success or negative on failure */ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; loff_t first_block_offset, last_block_offset, max_length; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; unsigned int credits; int ret = 0, ret2 = 0; trace_ext4_punch_hole(inode, offset, length, 0); /* * Write out all dirty pages to avoid race conditions * Then release them. */ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { ret = filemap_write_and_wait_range(mapping, offset, offset + length - 1); if (ret) return ret; } inode_lock(inode); /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) goto out_mutex; /* * If the hole extends beyond i_size, set the hole * to end after the page that contains i_size */ if (offset + length > inode->i_size) { length = inode->i_size + PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) - offset; } /* * For punch hole the length + offset needs to be within one block * before last range. Adjust the length if it goes beyond that limit. */ max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; if (offset + length > max_length) length = max_length - offset; if (offset & (sb->s_blocksize - 1) || (offset + length) & (sb->s_blocksize - 1)) { /* * Attach jinode to inode for jbd2 if we do any zeroing of * partial block */ ret = ext4_inode_attach_jinode(inode); if (ret < 0) goto out_mutex; } /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out_mutex; /* * Prevent page faults from reinstantiating pages we have released from * page cache. */ filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) goto out_dio; first_block_offset = round_up(offset, sb->s_blocksize); last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; /* Now release the pages and zero block aligned part of pages*/ if (last_block_offset > first_block_offset) { ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) goto out_dio; truncate_pagecache_range(inode, first_block_offset, last_block_offset); } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(sb, ret); goto out_dio; } ret = ext4_zero_partial_blocks(handle, inode, offset, length); if (ret) goto out_stop; first_block = (offset + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); /* If there are blocks to remove, do it */ if (stop_block > first_block) { down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); ret = ext4_es_remove_extent(inode, first_block, stop_block - first_block); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_remove_space(inode, first_block, stop_block - 1); else ret = ext4_ind_remove_space(handle, inode, first_block, stop_block); up_write(&EXT4_I(inode)->i_data_sem); } ext4_fc_track_range(handle, inode, first_block, stop_block); if (IS_SYNC(inode)) ext4_handle_sync(handle); inode->i_mtime = inode->i_ctime = current_time(inode); ret2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret2)) ret = ret2; if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); out_dio: filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; } int ext4_inode_attach_jinode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct jbd2_inode *jinode; if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) return 0; jinode = jbd2_alloc_inode(GFP_KERNEL); spin_lock(&inode->i_lock); if (!ei->jinode) { if (!jinode) { spin_unlock(&inode->i_lock); return -ENOMEM; } ei->jinode = jinode; jbd2_journal_init_jbd_inode(ei->jinode, inode); jinode = NULL; } spin_unlock(&inode->i_lock); if (unlikely(jinode != NULL)) jbd2_free_inode(jinode); return 0; } /* * ext4_truncate() * * We block out ext4_get_block() block instantiations across the entire * transaction, and VFS/VM ensures that ext4_truncate() cannot run * simultaneously on behalf of the same inode. * * As we work through the truncate and commit bits of it to the journal there * is one core, guiding principle: the file's tree must always be consistent on * disk. We must be able to restart the truncate after a crash. * * The file's tree may be transiently inconsistent in memory (although it * probably isn't), but whenever we close off and commit a journal transaction, * the contents of (the filesystem + the journal) must be consistent and * restartable. It's pretty simple, really: bottom up, right to left (although * left-to-right works OK too). * * Note that at recovery time, journal replay occurs *before* the restart of * truncate against the orphan inode list. * * The committed inode has the new, desired i_size (which is the same as * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see * that this inode's truncate did not complete and it will again call * ext4_truncate() to have another go. So there will be instantiated blocks * to the right of the truncation point in a crashed ext4 filesystem. But * that's fine - as long as they are linked from the inode, the post-crash * ext4_truncate() run will find them and release them. */ int ext4_truncate(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int credits; int err = 0, err2; handle_t *handle; struct address_space *mapping = inode->i_mapping; /* * There is a possibility that we're either freeing the inode * or it's a completely new inode. In those cases we might not * have i_mutex locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) goto out_trace; if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); if (ext4_has_inline_data(inode)) { int has_inline = 1; err = ext4_inline_data_truncate(inode, &has_inline); if (err || has_inline) goto out_trace; } /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { err = ext4_inode_attach_jinode(inode); if (err) goto out_trace; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out_trace; } if (inode->i_size & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, mapping, inode->i_size); /* * We add the inode to the orphan list, so that if this * truncate spans multiple transactions, and we crash, we will * resume the truncate when the filesystem recovers. It also * marks the inode dirty, to catch the new size. * * Implication: the file must always be in a sane, consistent * truncatable state while each transaction commits. */ err = ext4_orphan_add(handle, inode); if (err) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) err = ext4_ext_truncate(handle, inode); else ext4_ind_truncate(handle, inode); up_write(&ei->i_data_sem); if (err) goto out_stop; if (IS_SYNC(inode)) ext4_handle_sync(handle); out_stop: /* * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. * However, if this was a real unlink then we were called by * ext4_evict_inode(), and we allow that function to clean up the * orphan info for us. */ if (inode->i_nlink) ext4_orphan_del(handle, inode); inode->i_mtime = inode->i_ctime = current_time(inode); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; ext4_journal_stop(handle); out_trace: trace_ext4_truncate_exit(inode); return err; } /* * ext4_get_inode_loc returns with an extra refcount against the inode's * underlying buffer_head on success. If 'in_mem' is true, we have all * data in memory that is needed to recreate the on-disk version of this * inode. */ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc, int in_mem, ext4_fsblk_t *ret_block) { struct ext4_group_desc *gdp; struct buffer_head *bh; ext4_fsblk_t block; struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; if (ino < EXT4_ROOT_INO || ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) return -EFSCORRUPTED; iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); if (!gdp) return -EIO; /* * Figure out the offset within the block group inode table */ inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)); iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); block = ext4_inode_table(sb, gdp); if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) || (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) { ext4_error(sb, "Invalid inode table block %llu in " "block_group %u", block, iloc->block_group); return -EFSCORRUPTED; } block += (inode_offset / inodes_per_block); bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; if (ext4_buffer_uptodate(bh)) goto has_buffer; lock_buffer(bh); if (ext4_buffer_uptodate(bh)) { /* Someone brought it uptodate while we waited */ unlock_buffer(bh); goto has_buffer; } /* * If we have all information of the inode in memory and this * is the only valid inode in the block, we need not read the * block. */ if (in_mem) { struct buffer_head *bitmap_bh; int i, start; start = inode_offset & ~(inodes_per_block - 1); /* Is the inode bitmap in cache? */ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); if (unlikely(!bitmap_bh)) goto make_io; /* * If the inode bitmap isn't in cache then the * optimisation may end up performing two reads instead * of one, so skip it. */ if (!buffer_uptodate(bitmap_bh)) { brelse(bitmap_bh); goto make_io; } for (i = start; i < start + inodes_per_block; i++) { if (i == inode_offset) continue; if (ext4_test_bit(i, bitmap_bh->b_data)) break; } brelse(bitmap_bh); if (i == start + inodes_per_block) { /* all other inodes are free, so skip I/O */ memset(bh->b_data, 0, bh->b_size); set_buffer_uptodate(bh); unlock_buffer(bh); goto has_buffer; } } make_io: /* * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table. */ blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; table = ext4_inode_table(sb, gdp); /* s_inode_readahead_blks is always a power of 2 */ b = block & ~((ext4_fsblk_t) ra_blks - 1); if (table > b) b = table; end = b + ra_blks; num = EXT4_INODES_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) num -= ext4_itable_unused_count(sb, gdp); table += num / inodes_per_block; if (end > table) end = table; while (b <= end) ext4_sb_breadahead_unmovable(sb, b++); } /* * There are other valid inodes in the buffer, this inode * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ trace_ext4_load_inode(sb, ino); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); blk_finish_plug(&plug); wait_on_buffer(bh); ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); if (!buffer_uptodate(bh)) { if (ret_block) *ret_block = block; brelse(bh); return -EIO; } has_buffer: iloc->bh = bh; return 0; } static int __ext4_get_inode_loc_noinmem(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, 0, &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; /* We have all inode data except xattrs in memory here. */ ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, !ext4_test_inode_state(inode, EXT4_STATE_XATTR), &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc) { return __ext4_get_inode_loc(sb, ino, iloc, 0, NULL); } static bool ext4_should_enable_dax(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (test_opt2(inode->i_sb, DAX_NEVER)) return false; if (!S_ISREG(inode->i_mode)) return false; if (ext4_should_journal_data(inode)) return false; if (ext4_has_inline_data(inode)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) return false; if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) return false; if (test_opt(inode->i_sb, DAX_ALWAYS)) return true; return ext4_test_inode_flag(inode, EXT4_INODE_DAX); } void ext4_set_inode_flags(struct inode *inode, bool init) { unsigned int flags = EXT4_I(inode)->i_flags; unsigned int new_fl = 0; WARN_ON_ONCE(IS_DAX(inode) && init); if (flags & EXT4_SYNC_FL) new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) new_fl |= S_APPEND; if (flags & EXT4_IMMUTABLE_FL) new_fl |= S_IMMUTABLE; if (flags & EXT4_NOATIME_FL) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; /* Because of the way inode_set_flags() works we must preserve S_DAX * here if already set. */ new_fl |= (inode->i_flags & S_DAX); if (init && ext4_should_enable_dax(inode)) new_fl |= S_DAX; if (flags & EXT4_ENCRYPT_FL) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) new_fl |= S_CASEFOLD; if (flags & EXT4_VERITY_FL) new_fl |= S_VERITY; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| S_ENCRYPTED|S_CASEFOLD|S_VERITY); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { blkcnt_t i_blocks ; struct inode *inode = &(ei->vfs_inode); struct super_block *sb = inode->i_sb; if (ext4_has_feature_huge_file(sb)) { /* we are using combined 48 bit field */ i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | le32_to_cpu(raw_inode->i_blocks_lo); if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { /* i_blocks represent file system block size */ return i_blocks << (inode->i_blkbits - 9); } else { return i_blocks; } } else { return le32_to_cpu(raw_inode->i_blocks_lo); } } static inline int ext4_iget_extra_inode(struct inode *inode, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (EXT4_INODE_HAS_XATTR_SPACE(inode) && *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { int err; ext4_set_inode_state(inode, EXT4_STATE_XATTR); err = ext4_find_inline_data_nolock(inode); if (!err && ext4_has_inline_data(inode)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return err; } else EXT4_I(inode)->i_inline_off = 0; return 0; } int ext4_get_projid(struct inode *inode, kprojid_t *projid) { if (!ext4_has_feature_project(inode->i_sb)) return -EOPNOTSUPP; *projid = EXT4_I(inode)->i_projid; return 0; } /* * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag * set. */ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) inode_set_iversion_raw(inode, val); else inode_set_iversion_queried(inode, val); } static inline u64 ext4_inode_peek_iversion(const struct inode *inode) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return inode_peek_iversion_raw(inode); else return inode_peek_iversion(inode); } static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, const char *function, unsigned int line) { const char *err_str; if (flags & EXT4_IGET_EA_INODE) { if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { err_str = "missing EA_INODE flag"; goto error; } if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || EXT4_I(inode)->i_file_acl) { err_str = "ea_inode with extended attributes"; goto error; } } else { if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { /* * open_by_handle_at() could provide an old inode number * that has since been reused for an ea_inode; this does * not indicate filesystem corruption */ if (flags & EXT4_IGET_HANDLE) return -ESTALE; err_str = "unexpected EA_INODE flag"; goto error; } } if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { err_str = "unexpected bad inode w/o EXT4_IGET_BAD"; goto error; } return 0; error: ext4_error_inode(inode, function, line, 0, err_str); return -EFSCORRUPTED; } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) { struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; int block; uid_t i_uid; gid_t i_gid; projid_t i_projid; if ((!(flags & EXT4_IGET_SPECIAL) && ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || ino == le32_to_cpu(es->s_usr_quota_inum) || ino == le32_to_cpu(es->s_grp_quota_inum) || ino == le32_to_cpu(es->s_prj_quota_inum) || ino == le32_to_cpu(es->s_orphan_file_inum))) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) return ERR_PTR(-ESTALE); __ext4_error(sb, function, line, false, EFSCORRUPTED, 0, "inode #%lu: comm %s: iget: illegal inode #", ino, current->comm); return ERR_PTR(-EFSCORRUPTED); } inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { ret = check_igot_inode(inode, flags, function, line); if (ret) { iput(inode); return ERR_PTR(ret); } return inode; } ei = EXT4_I(inode); iloc.bh = NULL; ret = __ext4_get_inode_loc_noinmem(inode, &iloc); if (ret < 0) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); if ((flags & EXT4_IGET_HANDLE) && (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { ret = -ESTALE; goto bad_inode; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb) || (ei->i_extra_isize & 3)) { ext4_error_inode(inode, function, line, 0, "iget: bad extra_isize %u " "(inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; goto bad_inode; } } else ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ if (ext4_has_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) && (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) { ext4_error_inode_err(inode, function, line, 0, EFSBADCRC, "iget: checksum invalid"); ret = -EFSBADCRC; goto bad_inode; } inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if (ext4_has_feature_project(sb) && EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); else i_projid = EXT4_DEF_PROJID; if (!(test_opt(inode->i_sb, NO_UID32))) { i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. * This is needed because nfsd might try to access dead inodes * the test is that same one that e2fsck uses * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && ino != EXT4_BOOT_LOADER_INO) { /* this inode is deleted or unallocated */ if (flags & EXT4_IGET_SPECIAL) { ext4_error_inode(inode, function, line, 0, "iget: special inode unallocated"); ret = -EFSCORRUPTED; } else ret = -ESTALE; goto bad_inode; } /* The only unlinked inodes we let through here have * valid i_mode and are being read by the orphan * recovery code: that's fine, we're about to complete * the process of deleting those. * OR it is the EXT4_BOOT_LOADER_INO which is * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); ext4_set_inode_flags(inode, true); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); if ((size = i_size_read(inode)) < 0) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; goto bad_inode; } /* * If dir_index is not enabled but there's dir with INDEX flag set, * we'd normally treat htree data as empty space. But with metadata * checksumming that corrupts checksums so forbid that. */ if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ext4_error_inode(inode, function, line, 0, "iget: Dir with htree data on filesystem without dir_index feature."); ret = -EFSCORRUPTED; goto bad_inode; } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; ei->i_last_alloc_group = ~0; /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! */ for (block = 0; block < EXT4_N_BLOCKS; block++) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); ext4_fc_init_inode(&ei->vfs_inode); /* * Set transaction id's of transactions that have to be committed * to finish f[data]sync. We set them to currently running transaction * as we cannot be sure that the inode or some of its metadata isn't * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ if (journal) { transaction_t *transaction; tid_t tid; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) transaction = journal->j_running_transaction; else transaction = journal->j_committing_transaction; if (transaction) tid = transaction->t_tid; else tid = journal->j_commit_sequence; read_unlock(&journal->j_state_lock); ei->i_sync_tid = tid; ei->i_datasync_tid = tid; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { ret = ext4_iget_extra_inode(inode, raw_inode, ei); if (ret) goto bad_inode; } } EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = le32_to_cpu(raw_inode->i_disk_version); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) ivers |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } ext4_inode_set_iversion_queried(inode, ivers); } ret = 0; if (ei->i_file_acl && !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { ext4_error_inode(inode, function, line, 0, "iget: bad extended attribute block %llu", ei->i_file_acl); ret = -EFSCORRUPTED; goto bad_inode; } else if (!ext4_has_inline_data(inode)) { /* validate the block references in the inode */ if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode)))) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_check_inode(inode); else ret = ext4_ind_check_inode(inode); } } if (ret) goto bad_inode; if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { /* VFS does not allow setting these so must be corruption */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { ext4_error_inode(inode, function, line, 0, "iget: immutable or append flags " "not allowed on symlinks"); ret = -EFSCORRUPTED; goto bad_inode; } if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; ext4_set_aops(inode); } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_link = (char *)ei->i_data; inode->i_op = &ext4_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); } else { inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); } inode_nohighmem(inode); } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; if (raw_inode->i_block[0]) init_special_inode(inode, inode->i_mode, old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else if (ino == EXT4_BOOT_LOADER_INO) { make_bad_inode(inode); } else { ret = -EFSCORRUPTED; ext4_error_inode(inode, function, line, 0, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); ret = -EFSCORRUPTED; goto bad_inode; } ret = check_igot_inode(inode, flags, function, line); /* * -ESTALE here means there is nothing inherently wrong with the inode, * it's just not an inode we can return for an fhandle lookup. */ if (ret == -ESTALE) { brelse(iloc.bh); unlock_new_inode(inode); iput(inode); return ERR_PTR(-ESTALE); } if (ret) goto bad_inode; brelse(iloc.bh); unlock_new_inode(inode); return inode; bad_inode: brelse(iloc.bh); iget_failed(inode); return ERR_PTR(ret); } static int ext4_inode_blocks_set(handle_t *handle, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { struct inode *inode = &(ei->vfs_inode); u64 i_blocks = READ_ONCE(inode->i_blocks); struct super_block *sb = inode->i_sb; if (i_blocks <= ~0U) { /* * i_blocks can be represented in a 32 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } /* * This should never happen since sb->s_maxbytes should not have * allowed this, sb->s_maxbytes was set according to the huge_file * feature in ext4_fill_super(). */ if (!ext4_has_feature_huge_file(sb)) return -EFSCORRUPTED; if (i_blocks <= 0xffffffffffffULL) { /* * i_blocks can be represented in a 48 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); } else { ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); /* i_block is stored in file system block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); } return 0; } static void __ext4_update_other_inode_time(struct super_block *sb, unsigned long orig_ino, unsigned long ino, struct ext4_inode *raw_inode) { struct inode *inode; inode = find_inode_by_ino_rcu(sb, ino); if (!inode) return; if (!inode_is_dirtytime_only(inode)) return; spin_lock(&inode->i_lock); if (inode_is_dirtytime_only(inode)) { struct ext4_inode_info *ei = EXT4_I(inode); inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); trace_ext4_other_inode_update_time(inode, orig_ino); return; } spin_unlock(&inode->i_lock); } /* * Opportunistically update the other time fields for other inodes in * the same inode table block. */ static void ext4_update_other_inodes_time(struct super_block *sb, unsigned long orig_ino, char *buf) { unsigned long ino; int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; int inode_size = EXT4_INODE_SIZE(sb); /* * Calculate the first inode in the inode table block. Inode * numbers are one-based. That is, the first inode in a block * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). */ ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; rcu_read_lock(); for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { if (ino == orig_ino) continue; __ext4_update_other_inode_time(sb, orig_ino, ino, (struct ext4_inode *)buf); } rcu_read_unlock(); } /* * Post the struct inode info into an on-disk inode location in the * buffer-cache. This gobbles the caller's reference to the * buffer_head in the inode location struct. * * The caller must have write access to iloc->bh. */ static int ext4_do_update_inode(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; struct super_block *sb = inode->i_sb; int err = 0, block; int need_datasync = 0, set_large_file = 0; uid_t i_uid; gid_t i_gid; projid_t i_projid; spin_lock(&ei->i_raw_lock); /* * For fields not tracked in the in-memory inode, initialise them * to zero for new inodes. */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); err = ext4_inode_blocks_set(handle, raw_inode, ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); i_projid = from_kprojid(&init_user_ns, ei->i_projid); if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. Otherwise, * old inodes get re-used with the upper 16 bits of the * uid/gid intact. */ if (ei->i_dtime && list_empty(&ei->i_orphan)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } else { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = cpu_to_le16(high_16_bits(i_gid)); } } else { raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) { ext4_isize_set(raw_inode, ei->i_disksize); need_datasync = 1; } if (ei->i_disksize > 0x7fffffffULL) { if (!ext4_has_feature_large_file(sb) || EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV)) set_large_file = 1; } raw_inode->i_generation = cpu_to_le32(inode->i_generation); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { raw_inode->i_block[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); raw_inode->i_block[1] = 0; } else { raw_inode->i_block[0] = 0; raw_inode->i_block[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } } else if (!ext4_has_inline_data(inode)) { for (block = 0; block < EXT4_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; } if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = ext4_inode_peek_iversion(inode); raw_inode->i_disk_version = cpu_to_le32(ivers); if (ei->i_extra_isize) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) raw_inode->i_version_hi = cpu_to_le32(ivers >> 32); raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); } } if (i_projid != EXT4_DEF_PROJID && !ext4_has_feature_project(inode->i_sb)) err = err ?: -EFSCORRUPTED; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) raw_inode->i_projid = cpu_to_le32(i_projid); ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); if (err) { EXT4_ERROR_INODE(inode, "corrupted inode contents"); goto out_brelse; } if (inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) goto out_error; ext4_clear_inode_state(inode, EXT4_STATE_NEW); if (set_large_file) { BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE); if (err) goto out_error; lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_large_file(sb); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_handle_sync(handle); err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } ext4_update_inode_fsync_trans(handle, inode, need_datasync); out_error: ext4_std_error(inode->i_sb, err); out_brelse: brelse(bh); return err; } /* * ext4_write_inode() * * We are called from a few places: * * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * * - Within flush work (sys_sync(), kupdate and such). * We wait on commit, if told to. * * - Within iput_final() -> write_inode_now() * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in * which we are interested. * * It would be a bug for them to not do this. The code: * * mark_inode_dirty(inode) * stuff(); * inode->i_size = expr; * * is in error because write_inode() could occur while `stuff()' is running, * and the new i_size will be lost. Plus the inode will no longer be on the * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) || sb_rdonly(inode->i_sb)) return 0; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { ext4_debug("called recursively, non-PF_MEMALLOC!\n"); dump_stack(); return -EIO; } /* * No need to force transaction in WB_SYNC_NONE mode. Also * ext4_sync_fs() will force the commit after everything is * written. */ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); } else { struct ext4_iloc iloc; err = __ext4_get_inode_loc_noinmem(inode, &iloc); if (err) return err; /* * sync(2) will flush the whole buffer cache. No need to do * it here separately for each inode. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO, "IO error syncing inode"); err = -EIO; } brelse(iloc.bh); } return err; } /* * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate * buffers that are attached to a page stradding i_size and are undergoing * commit. In that case we have to wait for commit to finish and try again. */ static void ext4_wait_for_tail_page_commit(struct inode *inode) { struct page *page; unsigned offset; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid; int ret; bool has_transaction; offset = inode->i_size & (PAGE_SIZE - 1); /* * If the page is fully truncated, we don't need to wait for any commit * (and we even should not as __ext4_journalled_invalidatepage() may * strip all buffers from the page but keep the page dirty which can then * confuse e.g. concurrent ext4_writepage() seeing dirty page without * buffers). Also we don't need to wait for any commit if all buffers in * the page remain valid. This is most beneficial for the common case of * blocksize == PAGESIZE. */ if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) return; while (1) { page = find_lock_page(inode->i_mapping, inode->i_size >> PAGE_SHIFT); if (!page) return; ret = __ext4_journalled_invalidatepage(page, offset, PAGE_SIZE - offset); unlock_page(page); put_page(page); if (ret != -EBUSY) return; has_transaction = false; read_lock(&journal->j_state_lock); if (journal->j_committing_transaction) { commit_tid = journal->j_committing_transaction->t_tid; has_transaction = true; } read_unlock(&journal->j_state_lock); if (has_transaction) jbd2_log_wait_commit(journal, commit_tid); } } /* * ext4_setattr() * * Called from notify_change. * * We want to trap VFS attempts to truncate the file as soon as * possible. In particular, we want to make sure that when the VFS * shrinks i_size, we put the inode on the orphan list and modify * i_disksize immediately, so that during the subsequent flushing of * dirty pages and freeing of disk blocks, we can guarantee that any * commit will leave the blocks being flushed in an unused state on * disk. (On recovery, the inode will get truncated and the blocks will * be freed, so we have a strong guarantee that no future commit will * leave these blocks visible to the user.) * * Another thing we have to assure is that if we are in ordered mode * and inode is still attached to the committing transaction, we must * we start writeout of all the dirty pages which are being truncated. * This way we are sure that all the data written in the previous * transaction are already on disk (truncate waits for pages under * writeback). * * Called with inode->i_mutex down. */ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error, rc = 0; int orphan = 0; const unsigned int ia_valid = attr->ia_valid; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; if (unlikely(IS_APPEND(inode) && (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)))) return -EPERM; error = setattr_prepare(mnt_userns, dentry, attr); if (error) return error; error = fscrypt_prepare_setattr(dentry, attr); if (error) return error; error = fsverity_prepare_setattr(dentry, attr); if (error) return error; if (is_quota_modification(inode, attr)) { error = dquot_initialize(inode); if (error) return error; } if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; } /* dquot_transfer() calls back ext4_get_inode_usage() which * counts xattr inode references. */ down_read(&EXT4_I(inode)->xattr_sem); error = dquot_transfer(inode, attr); up_read(&EXT4_I(inode)->xattr_sem); if (error) { ext4_journal_stop(handle); return error; } /* Update corresponding info in inode so that everything is in * one transaction */ if (attr->ia_valid & ATTR_UID) inode->i_uid = attr->ia_uid; if (attr->ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); if (unlikely(error)) { return error; } } if (attr->ia_valid & ATTR_SIZE) { handle_t *handle; loff_t oldsize = inode->i_size; loff_t old_disksize; int shrink = (attr->ia_size < inode->i_size); if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (attr->ia_size > sbi->s_bitmap_maxbytes) { return -EFBIG; } } if (!S_ISREG(inode->i_mode)) { return -EINVAL; } if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) inode_inc_iversion(inode); if (shrink) { if (ext4_should_order_data(inode)) { error = ext4_begin_ordered_truncate(inode, attr->ia_size); if (error) goto err_out; } /* * Blocks are going to be removed from the inode. Wait * for dio in flight. */ inode_dio_wait(inode); } filemap_invalidate_lock(inode->i_mapping); rc = ext4_break_layouts(inode); if (rc) { filemap_invalidate_unlock(inode->i_mapping); goto err_out; } if (attr->ia_size != inode->i_size) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto out_mmap_sem; } if (ext4_handle_valid(handle) && shrink) { error = ext4_orphan_add(handle, inode); orphan = 1; } /* * Update c/mtime on truncate up, ext4_truncate() will * update c/mtime in shrink case below */ if (!shrink) { inode->i_mtime = current_time(inode); inode->i_ctime = inode->i_mtime; } if (shrink) ext4_fc_track_range(handle, inode, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits, EXT_MAX_BLOCKS - 1); else ext4_fc_track_range( handle, inode, (oldsize > 0 ? oldsize - 1 : oldsize) >> inode->i_sb->s_blocksize_bits, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits); down_write(&EXT4_I(inode)->i_data_sem); old_disksize = EXT4_I(inode)->i_disksize; EXT4_I(inode)->i_disksize = attr->ia_size; rc = ext4_mark_inode_dirty(handle, inode); if (!error) error = rc; /* * We have to update i_size under i_data_sem together * with i_disksize to avoid races with writeback code * running ext4_wb_update_i_disksize(). */ if (!error) i_size_write(inode, attr->ia_size); else EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); ext4_journal_stop(handle); if (error) goto out_mmap_sem; if (!shrink) { pagecache_isize_extended(inode, oldsize, inode->i_size); } else if (ext4_should_journal_data(inode)) { ext4_wait_for_tail_page_commit(inode); } } /* * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. */ truncate_pagecache(inode, inode->i_size); /* * Call ext4_truncate() even if i_size didn't change to * truncate possible preallocated blocks. */ if (attr->ia_size <= oldsize) { rc = ext4_truncate(inode); if (rc) error = rc; } out_mmap_sem: filemap_invalidate_unlock(inode->i_mapping); } if (!error) { setattr_copy(mnt_userns, inode, attr); mark_inode_dirty(inode); } /* * If the call to ext4_truncate failed to get a transaction handle at * all, we need to clean up the in-core orphan list manually. */ if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); if (!error && (ia_valid & ATTR_MODE)) rc = posix_acl_chmod(mnt_userns, inode, inode->i_mode); err_out: if (error) ext4_std_error(inode->i_sb, error); if (!error) error = rc; return error; } int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ext4_inode *raw_inode; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; if ((request_mask & STATX_BTIME) && EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; stat->btime.tv_nsec = ei->i_crtime.tv_nsec; } flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (flags & EXT4_COMPR_FL) stat->attributes |= STATX_ATTR_COMPRESSED; if (flags & EXT4_ENCRYPT_FL) stat->attributes |= STATX_ATTR_ENCRYPTED; if (flags & EXT4_IMMUTABLE_FL) stat->attributes |= STATX_ATTR_IMMUTABLE; if (flags & EXT4_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; if (flags & EXT4_VERITY_FL) stat->attributes |= STATX_ATTR_VERITY; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP | STATX_ATTR_VERITY); generic_fillattr(mnt_userns, inode, stat); return 0; } int ext4_file_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); u64 delalloc_blocks; ext4_getattr(mnt_userns, path, stat, request_mask, query_flags); /* * If there is inline data in the inode, the inode will normally not * have data blocks allocated (it may have an external xattr block). * Report at least one sector for such files, so tools like tar, rsync, * others don't incorrectly think the file is completely sparse. */ if (unlikely(ext4_has_inline_data(inode))) stat->blocks += (stat->size + 511) >> 9; /* * We can't update i_blocks if the block allocation is delayed * otherwise in the case of system crash before the real block * allocation is done, we will have i_blocks inconsistent with * on-disk file blocks. * We always keep i_blocks updated together with real * allocation. But to not confuse with user, stat * will return the blocks that include the delayed allocation * blocks for this file. */ delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), EXT4_I(inode)->i_reserved_data_blocks); stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9); return 0; } static int ext4_index_trans_blocks(struct inode *inode, int lblocks, int pextents) { if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return ext4_ind_trans_blocks(inode, lblocks); return ext4_ext_index_trans_blocks(inode, pextents); } /* * Account for index blocks, block groups bitmaps and block group * descriptor blocks if modify datablocks and index blocks * worse case, the indexs blocks spread over different block groups * * If datablocks are discontiguous, they are possible to spread over * different block groups too. If they are contiguous, with flexbg, * they could still across block group boundary. * * Also account for superblock, inode, quota and xattr blocks */ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; int idxblocks; int ret = 0; /* * How many index blocks need to touch to map @lblocks logical blocks * to @pextents physical extents? */ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); ret = idxblocks; /* * Now let's see how many group bitmaps and group descriptors need * to account */ groups = idxblocks + pextents; gdpblocks = groups; if (groups > ngroups) groups = ngroups; if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; /* bitmaps and block group descriptor blocks */ ret += groups + gdpblocks; /* Blocks for super block, inode, quota and xattr blocks */ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); return ret; } /* * Calculate the total number of credits to reserve to fit * the modification of a single pages into a single transaction, * which may include multiple chunks of block allocations. * * This could be called via ext4_write_begin() * * We need to consider the worse case, when * one new block per extent. */ int ext4_writepage_trans_blocks(struct inode *inode) { int bpp = ext4_journal_blocks_per_page(inode); int ret; ret = ext4_meta_trans_blocks(inode, bpp, bpp); /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) ret += bpp; return ret; } /* * Calculate the journal credits for a chunk of data modification. * * This is called from DIO, fallocate or whoever calling * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. * * journal buffers for data blocks are not included here, as DIO * and fallocate do no need to journal data buffers. */ int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) { return ext4_meta_trans_blocks(inode, nrblocks, 1); } /* * The caller must have previously called ext4_reserve_inode_write(). * Give this, we know that the caller already has write access to iloc->bh. */ int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int err = 0; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { put_bh(iloc->bh); return -EIO; } ext4_fc_track_inode(handle, inode); /* * ea_inodes are using i_version for storing reference count, don't * mess with it */ if (IS_I_VERSION(inode) && !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) inode_inc_iversion(inode); /* the do_update_inode consumes one bh->b_count */ get_bh(iloc->bh); /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ err = ext4_do_update_inode(handle, inode, iloc); put_bh(iloc->bh); return err; } /* * On success, We end up with an outstanding reference count against * iloc->bh. This _must_ be cleaned up later. */ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int err; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; err = ext4_get_inode_loc(inode, iloc); if (!err) { BUFFER_TRACE(iloc->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, EXT4_JTR_NONE); if (err) { brelse(iloc->bh); iloc->bh = NULL; } } ext4_std_error(inode->i_sb, err); return err; } static int __ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc, handle_t *handle, int *no_expand) { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); int error; /* this was checked at iget time, but double check for good measure */ if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) || (ei->i_extra_isize & 3)) { EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); return -EFSCORRUPTED; } if ((new_extra_isize < ei->i_extra_isize) || (new_extra_isize < 4) || (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE)) return -EINVAL; /* Should never happen */ raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); /* No extended attributes present */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize, 0, new_extra_isize - EXT4_I(inode)->i_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; return 0; } /* * We may need to allocate external xattr block so we need quotas * initialized. Here we can be called with various locks held so we * cannot affort to initialize quotas ourselves. So just bail. */ if (dquot_initialize_needed(inode)) return -EAGAIN; /* try to expand with EAs present */ error = ext4_expand_extra_isize_ea(inode, new_extra_isize, raw_inode, handle); if (error) { /* * Inode size expansion failed; don't try again */ *no_expand = 1; } return error; } /* * Expand an inode by new_extra_isize bytes. * Returns 0 on success or negative error number on failure. */ static int ext4_try_to_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc iloc, handle_t *handle) { int no_expand; int error; if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) return -EOVERFLOW; /* * In nojournal mode, we can immediately attempt to expand * the inode. When journaled, we first need to obtain extra * buffer credits since we may write into the EA block * with this same handle. If journal_extend fails, then it will * only result in a minor loss of functionality for that inode. * If this is felt to be critical, then e2fsck should be run to * force a large enough s_min_extra_isize. */ if (ext4_journal_extend(handle, EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0) return -ENOSPC; if (ext4_write_trylock_xattr(inode, &no_expand) == 0) return -EBUSY; error = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc, handle, &no_expand); ext4_write_unlock_xattr(inode, &no_expand); return error; } int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc) { handle_t *handle; int no_expand; int error, rc; if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { brelse(iloc->bh); return -EOVERFLOW; } handle = ext4_journal_start(inode, EXT4_HT_INODE, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { error = PTR_ERR(handle); brelse(iloc->bh); return error; } ext4_write_lock_xattr(inode, &no_expand); BUFFER_TRACE(iloc->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, EXT4_JTR_NONE); if (error) { brelse(iloc->bh); goto out_unlock; } error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, handle, &no_expand); rc = ext4_mark_iloc_dirty(handle, inode, iloc); if (!error) error = rc; out_unlock: ext4_write_unlock_xattr(inode, &no_expand); ext4_journal_stop(handle); return error; } /* * What we do here is to mark the in-core inode as clean with respect to inode * dirtiness (it may still be data-dirty). * This means that the in-core inode may be reaped by prune_icache * without having to perform any I/O. This is a very good thing, * because *any* task may call prune_icache - even ones which * have a transaction open against a different journal. * * Is this cheating? Not really. Sure, we haven't written the * inode out, but prune_icache isn't a user-visible syncing function. * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) * we start and wait on commits. */ int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, const char *func, unsigned int line) { struct ext4_iloc iloc; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err; might_sleep(); trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out; if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize) ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize, iloc, handle); err = ext4_mark_iloc_dirty(handle, inode, &iloc); out: if (unlikely(err)) ext4_error_inode_err(inode, func, line, 0, err, "mark_inode_dirty error"); return err; } /* * ext4_dirty_inode() is called from __mark_inode_dirty() * * We're really interested in the case where a file is being extended. * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * * Also, dquot_alloc_block() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. */ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return; ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); } int ext4_change_inode_journal_flag(struct inode *inode, int val) { journal_t *journal; handle_t *handle; int err; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); /* * We have to be very careful here: changing a data block's * journaling status dynamically is dangerous. If we write a * data block to the journal, change the status and then delete * that block, we risk forgetting to revoke the old log record * from the journal and so a subsequent replay can corrupt data. * So, first we make sure that the journal is empty and that * nobody is changing anything. */ journal = EXT4_JOURNAL(inode); if (!journal) return 0; if (is_journal_aborted(journal)) return -EROFS; /* Wait for all existing dio workers */ inode_dio_wait(inode); /* * Before flushing the journal and switching inode's aops, we have * to flush all dirty data the inode has. There can be outstanding * delayed allocations, there can be unwritten extents created by * fallocate or buffered writes in dioread_nolock mode covered by * dirty data which can be converted only after flushing the dirty * data (and journalled aops don't know how to handle these cases). */ if (val) { filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err < 0) { filemap_invalidate_unlock(inode->i_mapping); return err; } } percpu_down_write(&sbi->s_writepages_rwsem); jbd2_journal_lock_updates(journal); /* * OK, there are no updates running now, and all cached data is * synced to disk. We are now in a completely consistent state * which doesn't have anything in the journal, and we know that * no filesystem updates are running, so it is safe to modify * the inode's in-core data-journaling state flag now. */ if (val) ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); else { err = jbd2_journal_flush(journal, 0); if (err < 0) { jbd2_journal_unlock_updates(journal); percpu_up_write(&sbi->s_writepages_rwsem); return err; } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); percpu_up_write(&sbi->s_writepages_rwsem); if (val) filemap_invalidate_unlock(inode->i_mapping); /* Finally we can mark the inode as dirty. */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle); err = ext4_mark_inode_dirty(handle, inode); ext4_handle_sync(handle); ext4_journal_stop(handle); ext4_std_error(inode->i_sb, err); return err; } static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, struct buffer_head *bh) { return !buffer_mapped(bh); } vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = vmf->page; loff_t size; unsigned long len; int err; vm_fault_t ret; struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; handle_t *handle; get_block_t *get_block; int retries = 0; if (unlikely(IS_IMMUTABLE(inode))) return VM_FAULT_SIGBUS; sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); filemap_invalidate_lock_shared(mapping); err = ext4_convert_inline_data(inode); if (err) goto out_ret; /* * On data journalling we skip straight to the transaction handle: * there's no delalloc; page truncated will be checked later; the * early return w/ all buffers mapped (calculates size/len) can't * be used; and there's no dioread_nolock, so only ext4_get_block. */ if (ext4_should_journal_data(inode)) goto retry_alloc; /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb)) { do { err = block_page_mkwrite(vma, vmf, ext4_da_get_block_prep); } while (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); goto out_ret; } lock_page(page); size = i_size_read(inode); /* Page got truncated from under us? */ if (page->mapping != mapping || page_offset(page) > size) { unlock_page(page); ret = VM_FAULT_NOPAGE; goto out; } if (page->index == size >> PAGE_SHIFT) len = size & ~PAGE_MASK; else len = PAGE_SIZE; /* * Return if we have all the buffers mapped. This avoids the need to do * journal_start/journal_stop which can block and take a long time * * This cannot be done for data journalling, as we have to add the * inode to the transaction's list to writeprotect pages on commit. */ if (page_has_buffers(page)) { if (!ext4_walk_page_buffers(NULL, inode, page_buffers(page), 0, len, NULL, ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ wait_for_stable_page(page); ret = VM_FAULT_LOCKED; goto out; } } unlock_page(page); /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) get_block = ext4_get_block_unwritten; else get_block = ext4_get_block; retry_alloc: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = VM_FAULT_SIGBUS; goto out; } /* * Data journalling can't use block_page_mkwrite() because it * will set_buffer_dirty() before do_journal_get_write_access() * thus might hit warning messages for dirty metadata buffers. */ if (!ext4_should_journal_data(inode)) { err = block_page_mkwrite(vma, vmf, get_block); } else { lock_page(page); size = i_size_read(inode); /* Page got truncated from under us? */ if (page->mapping != mapping || page_offset(page) > size) { ret = VM_FAULT_NOPAGE; goto out_error; } if (page->index == size >> PAGE_SHIFT) len = size & ~PAGE_MASK; else len = PAGE_SIZE; err = __block_write_begin(page, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; if (ext4_walk_page_buffers(handle, inode, page_buffers(page), 0, len, NULL, do_journal_get_write_access)) goto out_error; if (ext4_walk_page_buffers(handle, inode, page_buffers(page), 0, len, NULL, write_end_fn)) goto out_error; if (ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len)) goto out_error; ext4_set_inode_state(inode, EXT4_STATE_JDATA); } else { unlock_page(page); } } ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_alloc; out_ret: ret = block_page_mkwrite_return(err); out: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); return ret; out_error: unlock_page(page); ext4_journal_stop(handle); goto out; } |
21 2 2 12 8 21 10 14 3 2 1 4 2 7 1 6 1 1 6 4 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 | // SPDX-License-Identifier: GPL-2.0-or-later /* Large capacity key type * * Copyright (C) 2017-2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) "big_key: "fmt #include <linux/init.h> #include <linux/seq_file.h> #include <linux/file.h> #include <linux/shmem_fs.h> #include <linux/err.h> #include <linux/random.h> #include <keys/user-type.h> #include <keys/big_key-type.h> #include <crypto/chacha20poly1305.h> /* * Layout of key payload words. */ enum { big_key_data, big_key_path, big_key_path_2nd_part, big_key_len, }; /* * If the data is under this limit, there's no point creating a shm file to * hold it as the permanently resident metadata for the shmem fs will be at * least as large as the data. */ #define BIG_KEY_FILE_THRESHOLD (sizeof(struct inode) + sizeof(struct dentry)) /* * big_key defined keys take an arbitrary string as the description and an * arbitrary blob of data as the payload */ struct key_type key_type_big_key = { .name = "big_key", .preparse = big_key_preparse, .free_preparse = big_key_free_preparse, .instantiate = generic_key_instantiate, .revoke = big_key_revoke, .destroy = big_key_destroy, .describe = big_key_describe, .read = big_key_read, .update = big_key_update, }; /* * Preparse a big key */ int big_key_preparse(struct key_preparsed_payload *prep) { struct path *path = (struct path *)&prep->payload.data[big_key_path]; struct file *file; u8 *buf, *enckey; ssize_t written; size_t datalen = prep->datalen; size_t enclen = datalen + CHACHA20POLY1305_AUTHTAG_SIZE; int ret; if (datalen <= 0 || datalen > 1024 * 1024 || !prep->data) return -EINVAL; /* Set an arbitrary quota */ prep->quotalen = 16; prep->payload.data[big_key_len] = (void *)(unsigned long)datalen; if (datalen > BIG_KEY_FILE_THRESHOLD) { /* Create a shmem file to store the data in. This will permit the data * to be swapped out if needed. * * File content is stored encrypted with randomly generated key. * Since the key is random for each file, we can set the nonce * to zero, provided we never define a ->update() call. */ loff_t pos = 0; buf = kvmalloc(enclen, GFP_KERNEL); if (!buf) return -ENOMEM; /* generate random key */ enckey = kmalloc(CHACHA20POLY1305_KEY_SIZE, GFP_KERNEL); if (!enckey) { ret = -ENOMEM; goto error; } ret = get_random_bytes_wait(enckey, CHACHA20POLY1305_KEY_SIZE); if (unlikely(ret)) goto err_enckey; /* encrypt data */ chacha20poly1305_encrypt(buf, prep->data, datalen, NULL, 0, 0, enckey); /* save aligned data to file */ file = shmem_kernel_file_setup("", enclen, 0); if (IS_ERR(file)) { ret = PTR_ERR(file); goto err_enckey; } written = kernel_write(file, buf, enclen, &pos); if (written != enclen) { ret = written; if (written >= 0) ret = -EIO; goto err_fput; } /* Pin the mount and dentry to the key so that we can open it again * later */ prep->payload.data[big_key_data] = enckey; *path = file->f_path; path_get(path); fput(file); kvfree_sensitive(buf, enclen); } else { /* Just store the data in a buffer */ void *data = kmalloc(datalen, GFP_KERNEL); if (!data) return -ENOMEM; prep->payload.data[big_key_data] = data; memcpy(data, prep->data, prep->datalen); } return 0; err_fput: fput(file); err_enckey: kfree_sensitive(enckey); error: kvfree_sensitive(buf, enclen); return ret; } /* * Clear preparsement. */ void big_key_free_preparse(struct key_preparsed_payload *prep) { if (prep->datalen > BIG_KEY_FILE_THRESHOLD) { struct path *path = (struct path *)&prep->payload.data[big_key_path]; path_put(path); } kfree_sensitive(prep->payload.data[big_key_data]); } /* * dispose of the links from a revoked keyring * - called with the key sem write-locked */ void big_key_revoke(struct key *key) { struct path *path = (struct path *)&key->payload.data[big_key_path]; /* clear the quota */ key_payload_reserve(key, 0); if (key_is_positive(key) && (size_t)key->payload.data[big_key_len] > BIG_KEY_FILE_THRESHOLD) vfs_truncate(path, 0); } /* * dispose of the data dangling from the corpse of a big_key key */ void big_key_destroy(struct key *key) { size_t datalen = (size_t)key->payload.data[big_key_len]; if (datalen > BIG_KEY_FILE_THRESHOLD) { struct path *path = (struct path *)&key->payload.data[big_key_path]; path_put(path); path->mnt = NULL; path->dentry = NULL; } kfree_sensitive(key->payload.data[big_key_data]); key->payload.data[big_key_data] = NULL; } /* * Update a big key */ int big_key_update(struct key *key, struct key_preparsed_payload *prep) { int ret; ret = key_payload_reserve(key, prep->datalen); if (ret < 0) return ret; if (key_is_positive(key)) big_key_destroy(key); return generic_key_instantiate(key, prep); } /* * describe the big_key key */ void big_key_describe(const struct key *key, struct seq_file *m) { size_t datalen = (size_t)key->payload.data[big_key_len]; seq_puts(m, key->description); if (key_is_positive(key)) seq_printf(m, ": %zu [%s]", datalen, datalen > BIG_KEY_FILE_THRESHOLD ? "file" : "buff"); } /* * read the key data * - the key's semaphore is read-locked */ long big_key_read(const struct key *key, char *buffer, size_t buflen) { size_t datalen = (size_t)key->payload.data[big_key_len]; long ret; if (!buffer || buflen < datalen) return datalen; if (datalen > BIG_KEY_FILE_THRESHOLD) { struct path *path = (struct path *)&key->payload.data[big_key_path]; struct file *file; u8 *buf, *enckey = (u8 *)key->payload.data[big_key_data]; size_t enclen = datalen + CHACHA20POLY1305_AUTHTAG_SIZE; loff_t pos = 0; buf = kvmalloc(enclen, GFP_KERNEL); if (!buf) return -ENOMEM; file = dentry_open(path, O_RDONLY, current_cred()); if (IS_ERR(file)) { ret = PTR_ERR(file); goto error; } /* read file to kernel and decrypt */ ret = kernel_read(file, buf, enclen, &pos); if (ret != enclen) { if (ret >= 0) ret = -EIO; goto err_fput; } ret = chacha20poly1305_decrypt(buf, buf, enclen, NULL, 0, 0, enckey) ? 0 : -EBADMSG; if (unlikely(ret)) goto err_fput; ret = datalen; /* copy out decrypted data */ memcpy(buffer, buf, datalen); err_fput: fput(file); error: kvfree_sensitive(buf, enclen); } else { ret = datalen; memcpy(buffer, key->payload.data[big_key_data], datalen); } return ret; } /* * Register key type */ static int __init big_key_init(void) { return register_key_type(&key_type_big_key); } late_initcall(big_key_init); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 | /* SPDX-License-Identifier: GPL-2.0 */ /* * USB PHY defines * * These APIs may be used between USB controllers. USB device drivers * (for either host or peripheral roles) don't use these calls; they * continue to use just usb_device and usb_gadget. */ #ifndef __LINUX_USB_PHY_H #define __LINUX_USB_PHY_H #include <linux/extcon.h> #include <linux/notifier.h> #include <linux/usb.h> #include <uapi/linux/usb/charger.h> enum usb_phy_interface { USBPHY_INTERFACE_MODE_UNKNOWN, USBPHY_INTERFACE_MODE_UTMI, USBPHY_INTERFACE_MODE_UTMIW, USBPHY_INTERFACE_MODE_ULPI, USBPHY_INTERFACE_MODE_SERIAL, USBPHY_INTERFACE_MODE_HSIC, }; enum usb_phy_events { USB_EVENT_NONE, /* no events or cable disconnected */ USB_EVENT_VBUS, /* vbus valid event */ USB_EVENT_ID, /* id was grounded */ USB_EVENT_CHARGER, /* usb dedicated charger */ USB_EVENT_ENUMERATED, /* gadget driver enumerated */ }; /* associate a type with PHY */ enum usb_phy_type { USB_PHY_TYPE_UNDEFINED, USB_PHY_TYPE_USB2, USB_PHY_TYPE_USB3, }; /* OTG defines lots of enumeration states before device reset */ enum usb_otg_state { OTG_STATE_UNDEFINED = 0, /* single-role peripheral, and dual-role default-b */ OTG_STATE_B_IDLE, OTG_STATE_B_SRP_INIT, OTG_STATE_B_PERIPHERAL, /* extra dual-role default-b states */ OTG_STATE_B_WAIT_ACON, OTG_STATE_B_HOST, /* dual-role default-a */ OTG_STATE_A_IDLE, OTG_STATE_A_WAIT_VRISE, OTG_STATE_A_WAIT_BCON, OTG_STATE_A_HOST, OTG_STATE_A_SUSPEND, OTG_STATE_A_PERIPHERAL, OTG_STATE_A_WAIT_VFALL, OTG_STATE_A_VBUS_ERR, }; struct usb_phy; struct usb_otg; /* for phys connected thru an ULPI interface, the user must * provide access ops */ struct usb_phy_io_ops { int (*read)(struct usb_phy *x, u32 reg); int (*write)(struct usb_phy *x, u32 val, u32 reg); }; struct usb_charger_current { unsigned int sdp_min; unsigned int sdp_max; unsigned int dcp_min; unsigned int dcp_max; unsigned int cdp_min; unsigned int cdp_max; unsigned int aca_min; unsigned int aca_max; }; struct usb_phy { struct device *dev; const char *label; unsigned int flags; enum usb_phy_type type; enum usb_phy_events last_event; struct usb_otg *otg; struct device *io_dev; struct usb_phy_io_ops *io_ops; void __iomem *io_priv; /* to support extcon device */ struct extcon_dev *edev; struct extcon_dev *id_edev; struct notifier_block vbus_nb; struct notifier_block id_nb; struct notifier_block type_nb; /* Support USB charger */ enum usb_charger_type chg_type; enum usb_charger_state chg_state; struct usb_charger_current chg_cur; struct work_struct chg_work; /* for notification of usb_phy_events */ struct atomic_notifier_head notifier; /* to pass extra port status to the root hub */ u16 port_status; u16 port_change; /* to support controllers that have multiple phys */ struct list_head head; /* initialize/shutdown the phy */ int (*init)(struct usb_phy *x); void (*shutdown)(struct usb_phy *x); /* enable/disable VBUS */ int (*set_vbus)(struct usb_phy *x, int on); /* effective for B devices, ignored for A-peripheral */ int (*set_power)(struct usb_phy *x, unsigned mA); /* Set phy into suspend mode */ int (*set_suspend)(struct usb_phy *x, int suspend); /* * Set wakeup enable for PHY, in that case, the PHY can be * woken up from suspend status due to external events, * like vbus change, dp/dm change and id. */ int (*set_wakeup)(struct usb_phy *x, bool enabled); /* notify phy connect status change */ int (*notify_connect)(struct usb_phy *x, enum usb_device_speed speed); int (*notify_disconnect)(struct usb_phy *x, enum usb_device_speed speed); /* * Charger detection method can be implemented if you need to * manually detect the charger type. */ enum usb_charger_type (*charger_detect)(struct usb_phy *x); }; /* for board-specific init logic */ extern int usb_add_phy(struct usb_phy *, enum usb_phy_type type); extern int usb_add_phy_dev(struct usb_phy *); extern void usb_remove_phy(struct usb_phy *); /* helpers for direct access thru low-level io interface */ static inline int usb_phy_io_read(struct usb_phy *x, u32 reg) { if (x && x->io_ops && x->io_ops->read) return x->io_ops->read(x, reg); return -EINVAL; } static inline int usb_phy_io_write(struct usb_phy *x, u32 val, u32 reg) { if (x && x->io_ops && x->io_ops->write) return x->io_ops->write(x, val, reg); return -EINVAL; } static inline int usb_phy_init(struct usb_phy *x) { if (x && x->init) return x->init(x); return 0; } static inline void usb_phy_shutdown(struct usb_phy *x) { if (x && x->shutdown) x->shutdown(x); } static inline int usb_phy_vbus_on(struct usb_phy *x) { if (!x || !x->set_vbus) return 0; return x->set_vbus(x, true); } static inline int usb_phy_vbus_off(struct usb_phy *x) { if (!x || !x->set_vbus) return 0; return x->set_vbus(x, false); } /* for usb host and peripheral controller drivers */ #if IS_ENABLED(CONFIG_USB_PHY) extern struct usb_phy *usb_get_phy(enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index); extern struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb); extern void usb_put_phy(struct usb_phy *); extern void devm_usb_put_phy(struct device *dev, struct usb_phy *x); extern void usb_phy_set_event(struct usb_phy *x, unsigned long event); extern void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA); extern void usb_phy_get_charger_current(struct usb_phy *usb_phy, unsigned int *min, unsigned int *max); extern void usb_phy_set_charger_state(struct usb_phy *usb_phy, enum usb_charger_state state); #else static inline struct usb_phy *usb_get_phy(enum usb_phy_type type) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb) { return ERR_PTR(-ENXIO); } static inline void usb_put_phy(struct usb_phy *x) { } static inline void devm_usb_put_phy(struct device *dev, struct usb_phy *x) { } static inline void usb_phy_set_event(struct usb_phy *x, unsigned long event) { } static inline void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA) { } static inline void usb_phy_get_charger_current(struct usb_phy *usb_phy, unsigned int *min, unsigned int *max) { } static inline void usb_phy_set_charger_state(struct usb_phy *usb_phy, enum usb_charger_state state) { } #endif static inline int usb_phy_set_power(struct usb_phy *x, unsigned mA) { if (!x) return 0; usb_phy_set_charger_current(x, mA); if (x->set_power) return x->set_power(x, mA); return 0; } /* Context: can sleep */ static inline int usb_phy_set_suspend(struct usb_phy *x, int suspend) { if (x && x->set_suspend != NULL) return x->set_suspend(x, suspend); else return 0; } static inline int usb_phy_set_wakeup(struct usb_phy *x, bool enabled) { if (x && x->set_wakeup) return x->set_wakeup(x, enabled); else return 0; } static inline int usb_phy_notify_connect(struct usb_phy *x, enum usb_device_speed speed) { if (x && x->notify_connect) return x->notify_connect(x, speed); else return 0; } static inline int usb_phy_notify_disconnect(struct usb_phy *x, enum usb_device_speed speed) { if (x && x->notify_disconnect) return x->notify_disconnect(x, speed); else return 0; } /* notifiers */ static inline int usb_register_notifier(struct usb_phy *x, struct notifier_block *nb) { return atomic_notifier_chain_register(&x->notifier, nb); } static inline void usb_unregister_notifier(struct usb_phy *x, struct notifier_block *nb) { atomic_notifier_chain_unregister(&x->notifier, nb); } static inline const char *usb_phy_type_string(enum usb_phy_type type) { switch (type) { case USB_PHY_TYPE_USB2: return "USB2 PHY"; case USB_PHY_TYPE_USB3: return "USB3 PHY"; default: return "UNKNOWN PHY TYPE"; } } #endif /* __LINUX_USB_PHY_H */ |
5 3 215 5 5 11 78 22 2 89 1 33 55 20 25 50 75 89 32 57 2 89 277 29 145 21 197 77 77 2 76 77 3 22 22 55 74 3 14 9 3 60 1 10 1 53 32 31 3 12 51 54 19 12 8 54 9 6 12 12 6 6 26 12 6 20 20 20 9 9 1 1 19 7 8 5 10 8 2 2 2 79 79 71 78 35 1 37 56 28 12 11 28 82 27 56 56 56 7 6 56 56 56 19 29 1 17 1 2 1 1 36 74 21 14 39 3 8 10 6 60 29 2 65 83 4 16 67 67 65 67 4 15 48 7 5 73 72 69 33 30 1 7 24 24 70 24 24 71 147 147 145 143 144 144 96 197 199 1 92 107 130 13 12 134 65 73 49 11 77 1 65 11 202 1 68 135 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 |