// SPDX-License-Identifier: GPL-2.0-only #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/xdp.h> #include <net/xdp_sock.h> #include <net/netdev_rx_queue.h> #include <net/busy_poll.h> #include "netdev-genl-gen.h" #include "dev.h" struct netdev_nl_dump_ctx { unsigned long ifindex; unsigned int rxq_idx; unsigned int txq_idx; unsigned int napi_id; }; static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb) { NL_ASSERT_DUMP_CTX_FITS(struct netdev_nl_dump_ctx); return (struct netdev_nl_dump_ctx *)cb->ctx; } static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { u64 xsk_features = 0; u64 xdp_rx_meta = 0; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; #define XDP_METADATA_KFUNC(_, flag, __, xmo) \ if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \ xdp_rx_meta |= flag; XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC if (netdev->xsk_tx_metadata_ops) { if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp) xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; } if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, netdev->xdp_features, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp,
NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, xdp_rx_meta, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, xsk_features, NETDEV_A_DEV_PAD)) { genlmsg_cancel(rsp, hdr); return -EINVAL; } if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, netdev->xdp_zc_max_segs)) { genlmsg_cancel(rsp, hdr); return -EINVAL; } } genlmsg_end(rsp, hdr); return 0; } static void netdev_genl_dev_notify(struct net_device *netdev, int cmd) { struct genl_info info; struct sk_buff *ntf; if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev), NETDEV_NLGRP_MGMT)) return; genl_info_init_ntf(&info, &netdev_nl_family, cmd); ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!ntf) return; if (netdev_nl_dev_fill(netdev, ntf, &info)) { nlmsg_free(ntf); return; } genlmsg_multicast_netns(&netdev_nl_family, dev_net(netdev), ntf, 0, NETDEV_NLGRP_MGMT, GFP_KERNEL); } int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net_device *netdev; struct sk_buff *rsp; u32 ifindex; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX)) return -EINVAL; ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; rtnl_lock(); netdev = __dev_get_by_index(genl_info_net(info), ifindex); if (netdev) err = netdev_nl_dev_fill(netdev, rsp, info); else err = -ENODEV; rtnl_unlock(); if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; int err = 0; rtnl_lock(); for_each_netdev_dump(net, netdev, ctx->ifindex) { err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); if (err < 0) break; } rtnl_unlock(); if (err != -EMSGSIZE) return err; return skb->len; } static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { void *hdr; pid_t pid; if (WARN_ON_ONCE(!napi->dev)) return -EINVAL; if (!(napi->dev->flags & IFF_UP)) return 0; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (napi->napi_id >= MIN_NAPI_ID && nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) goto nla_put_failure; if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) goto nla_put_failure; if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq)) goto nla_put_failure; if (napi->thread) { pid = task_pid_nr(napi->thread); if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid)) goto nla_put_failure; } genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) { struct napi_struct *napi; struct sk_buff *rsp; u32 napi_id; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) return -EINVAL; napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; rtnl_lock(); napi = napi_by_id(napi_id); if (napi) err = netdev_nl_napi_fill_one(rsp, napi, info); else err = -EINVAL; rtnl_unlock(); if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } static int netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { struct napi_struct *napi; int err = 0; if (!(netdev->flags & 
IFF_UP)) return err; list_for_each_entry(napi, &netdev->napi_list, dev_list) { if (ctx->napi_id && napi->napi_id >= ctx->napi_id) continue; err = netdev_nl_napi_fill_one(rsp, napi, info); if (err) return err; ctx->napi_id = napi->napi_id; } return err; } int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; u32 ifindex = 0; int err = 0; if (info->attrs[NETDEV_A_NAPI_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]); rtnl_lock(); if (ifindex) { netdev = __dev_get_by_index(net, ifindex); if (netdev) err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); else err = -ENODEV; } else { for_each_netdev_dump(net, netdev, ctx->ifindex) { err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); if (err < 0) break; ctx->napi_id = 0; } } rtnl_unlock(); if (err != -EMSGSIZE) return err; return skb->len; } static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { struct netdev_rx_queue *rxq; struct netdev_queue *txq; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx) || nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type) || nla_put_u32(rsp, NETDEV_A_QUEUE_IFINDEX, netdev->ifindex)) goto nla_put_failure; switch (q_type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(netdev, q_idx); if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, rxq->napi->napi_id)) goto nla_put_failure; break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); if (txq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, txq->napi->napi_id)) goto nla_put_failure; } genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static int netdev_nl_queue_validate(struct net_device *netdev, u32 q_id, u32 q_type) { switch (q_type) { case NETDEV_QUEUE_TYPE_RX: if (q_id >= netdev->real_num_rx_queues) return -EINVAL; return 0; case NETDEV_QUEUE_TYPE_TX: if (q_id >= netdev->real_num_tx_queues) return -EINVAL; } return 0; } static int netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { int err = 0; if (!(netdev->flags & IFF_UP)) return err; err = netdev_nl_queue_validate(netdev, q_idx, q_type); if (err) return err; return netdev_nl_queue_fill_one(rsp, netdev, q_idx, q_type, info); } int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) { u32 q_id, q_type, ifindex; struct net_device *netdev; struct sk_buff *rsp; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_ID) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX)) return -EINVAL; q_id = nla_get_u32(info->attrs[NETDEV_A_QUEUE_ID]); q_type = nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]); ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; rtnl_lock(); netdev = __dev_get_by_index(genl_info_net(info), ifindex); if (netdev) err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info); else err = -ENODEV; rtnl_unlock(); if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } static int netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct 
netdev_nl_dump_ctx *ctx) { int err = 0; int i; if (!(netdev->flags & IFF_UP)) return err; for (i = ctx->rxq_idx; i < netdev->real_num_rx_queues;) { err = netdev_nl_queue_fill_one(rsp, netdev, i, NETDEV_QUEUE_TYPE_RX, info); if (err) return err; ctx->rxq_idx = i++; } for (i = ctx->txq_idx; i < netdev->real_num_tx_queues;) { err = netdev_nl_queue_fill_one(rsp, netdev, i, NETDEV_QUEUE_TYPE_TX, info); if (err) return err; ctx->txq_idx = i++; } return err; } int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; u32 ifindex = 0; int err = 0; if (info->attrs[NETDEV_A_QUEUE_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); rtnl_lock(); if (ifindex) { netdev = __dev_get_by_index(net, ifindex); if (netdev) err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); else err = -ENODEV; } else { for_each_netdev_dump(net, netdev, ctx->ifindex) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); if (err < 0) break; ctx->rxq_idx = 0; ctx->txq_idx = 0; } } rtnl_unlock(); if (err != -EMSGSIZE) return err; return skb->len; } static int netdev_genl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_REGISTER: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF); break; case NETDEV_UNREGISTER: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF); break; case NETDEV_XDP_FEAT_CHANGE: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF); break; } return NOTIFY_OK; } static struct notifier_block netdev_genl_nb = { .notifier_call = netdev_genl_netdevice_event, }; static int __init netdev_genl_init(void) { int err; err = register_netdevice_notifier(&netdev_genl_nb); if (err) return err; err = genl_register_family(&netdev_nl_family); if (err) goto err_unreg_ntf; return 0; err_unreg_ntf: unregister_netdevice_notifier(&netdev_genl_nb); return err; } subsys_initcall(netdev_genl_init);
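netdev_nl_dev_fill() above builds the XDP RX metadata bitmask by expanding XDP_METADATA_KFUNC_xxx, an X-macro whose entries pair a feature flag with the corresponding xdp_metadata_ops member. The standalone userspace sketch below illustrates that expansion pattern with made-up names (demo_ops, DEMO_FEATURE_*); it is only a demonstration of the macro technique, not part of the kernel file.

#include <stdint.h>
#include <stdio.h>

/* Standalone demo of the X-macro pattern; all names here are hypothetical. */
struct demo_ops {
	void (*get_timestamp)(void);		/* stands in for an xmo_* op */
	void (*get_hash)(void);			/* stands in for an xmo_* op */
};

#define DEMO_FEATURE_TIMESTAMP	(1ULL << 0)
#define DEMO_FEATURE_HASH	(1ULL << 1)

/* The X-macro list: one entry per (feature flag, ops member) pair. */
#define DEMO_FEATURE_xxx(X)				\
	X(DEMO_FEATURE_TIMESTAMP, get_timestamp)	\
	X(DEMO_FEATURE_HASH, get_hash)

static void demo_hash(void) { }

int main(void)
{
	struct demo_ops ops = { .get_hash = demo_hash };	/* only hash implemented */
	uint64_t features = 0;

	/* Expand the list into one "flag set if op implemented" test per entry. */
#define DEMO_SET_FLAG(flag, member)	\
	if (ops.member)			\
		features |= (flag);
	DEMO_FEATURE_xxx(DEMO_SET_FLAG)
#undef DEMO_SET_FLAG

	printf("advertised feature bitmask: %#llx\n", (unsigned long long)features);
	return 0;
}

Only the ops a driver actually implements contribute their flags, which is how the NETDEV_A_DEV_XDP_RX_METADATA_FEATURES attribute ends up reflecting the driver's xdp_metadata_ops.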
// SPDX-License-Identifier: GPL-2.0-only /* * Creates audit record for dropped/accepted packets * * (C) 2010-2011 Thomas Graf <tgraf@redhat.com> * (C) 2010-2011 Red Hat, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/audit.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_AUDIT.h> #include <linux/netfilter_bridge/ebtables.h> #include <net/ipv6.h> #include <net/ip.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Thomas Graf <tgraf@redhat.com>"); MODULE_DESCRIPTION("Xtables: creates audit records for dropped/accepted packets"); MODULE_ALIAS("ipt_AUDIT"); MODULE_ALIAS("ip6t_AUDIT"); MODULE_ALIAS("ebt_AUDIT"); MODULE_ALIAS("arpt_AUDIT"); static bool audit_ip4(struct audit_buffer *ab, struct sk_buff *skb) { struct iphdr _iph; const struct iphdr *ih; ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_iph), &_iph); if (!ih) return false; audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu", &ih->saddr, &ih->daddr, ih->protocol); return true; } static bool audit_ip6(struct audit_buffer *ab, struct sk_buff *skb) { struct ipv6hdr _ip6h; const struct ipv6hdr *ih; u8 nexthdr; __be16 frag_off; ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h); if (!ih) return false; nexthdr = ih->nexthdr; ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), &nexthdr, &frag_off); audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu", &ih->saddr, &ih->daddr, nexthdr); return true; } static unsigned int audit_tg(struct sk_buff *skb, const struct xt_action_param *par) { struct audit_buffer *ab; int fam = -1; if (audit_enabled == AUDIT_OFF) goto errout; ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT); if (ab == NULL) goto errout; audit_log_format(ab, "mark=%#x", skb->mark); switch (xt_family(par)) { case NFPROTO_BRIDGE: switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1; break; case htons(ETH_P_IPV6): fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1; break; } break; case NFPROTO_IPV4: fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1; break; case NFPROTO_IPV6: fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1; break; } if (fam == -1) audit_log_format(ab, " saddr=? daddr=? 
proto=-1"); audit_log_end(ab); errout: return XT_CONTINUE; } static unsigned int audit_tg_ebt(struct sk_buff *skb, const struct xt_action_param *par) { audit_tg(skb, par); return EBT_CONTINUE; } static int audit_tg_check(const struct xt_tgchk_param *par) { const struct xt_audit_info *info = par->targinfo; if (info->type > XT_AUDIT_TYPE_MAX) { pr_info_ratelimited("Audit type out of range (valid range: 0..%u)\n", XT_AUDIT_TYPE_MAX); return -ERANGE; } return 0; } static struct xt_target audit_tg_reg[] __read_mostly = { { .name = "AUDIT", .family = NFPROTO_UNSPEC, .target = audit_tg, .targetsize = sizeof(struct xt_audit_info), .checkentry = audit_tg_check, .me = THIS_MODULE, }, { .name = "AUDIT", .family = NFPROTO_BRIDGE, .target = audit_tg_ebt, .targetsize = sizeof(struct xt_audit_info), .checkentry = audit_tg_check, .me = THIS_MODULE, }, }; static int __init audit_tg_init(void) { return xt_register_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg)); } static void __exit audit_tg_exit(void) { xt_unregister_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg)); } module_init(audit_tg_init); module_exit(audit_tg_exit);
// SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/init.h> #include <linux/export.h> #include <linux/timer.h> #include <linux/acpi_pmtmr.h> #include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/clocksource.h> #include <linux/percpu.h> #include <linux/timex.h> #include <linux/static_key.h> #include <linux/static_call.h> #include <asm/hpet.h> #include <asm/timer.h> #include <asm/vgtod.h> #include <asm/time.h> #include <asm/delay.h> #include <asm/hypervisor.h> #include <asm/nmi.h> #include <asm/x86_init.h> #include <asm/geode.h> #include <asm/apic.h> #include <asm/intel-family.h> #include <asm/i8259.h> #include <asm/uv/uv.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); unsigned int __read_mostly tsc_khz; EXPORT_SYMBOL(tsc_khz); #define KHZ 1000 /* * TSC can be unstable due to cpufreq or due to unsynced TSCs */ static int __read_mostly tsc_unstable; static unsigned int __initdata tsc_early_khz; static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; static int __read_mostly tsc_force_recalibrate; static u32 art_to_tsc_numerator; static u32 art_to_tsc_denominator; static u64 art_to_tsc_offset; static struct clocksource *art_related_clocksource; struct cyc2ns { struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ seqcount_latch_t seq; /* 32 + 4 = 36 */ }; /* fits one cacheline */ static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); static int __init tsc_early_khz_setup(char *buf) { return kstrtouint(buf, 0, &tsc_early_khz); } early_param("tsc_early_khz", tsc_early_khz_setup); __always_inline void __cyc2ns_read(struct cyc2ns_data *data) { int seq, idx; do { seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); idx = seq & 1; data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); } __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) { preempt_disable_notrace(); __cyc2ns_read(data); } __always_inline void cyc2ns_read_end(void) { preempt_enable_notrace(); } /* * Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: * ns = cycles / (freq / ns_per_sec) * ns = cycles * (ns_per_sec / freq) * ns = cycles * (10^9 / (cpu_khz * 10^3)) * ns = cycles * (10^6 / cpu_khz) * * Then we use scaling math (suggested by george@mvista.com) to get: * ns = cycles * (10^6 * SC / cpu_khz) / SC * ns = cycles * cyc2ns_scale / SC * * And since SC is a constant power of two, we can convert the div * into a shift. The larger SC is, the more accurate the conversion, but * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication * (64-bit result) can be used. * * We can use khz divisor instead of mhz to keep a better precision. * (mathieu.desnoyers@polymtl.ca) * * -johnstul@us.ibm.com "math is hard, lets go shopping!"
*/ static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; unsigned long long ns; __cyc2ns_read(&data); ns = data.cyc2ns_offset; ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); return ns; } static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) { unsigned long long ns; preempt_disable_notrace(); ns = __cycles_2_ns(cyc); preempt_enable_notrace(); return ns; } static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long long ns_now; struct cyc2ns_data data; struct cyc2ns *c2n; ns_now = cycles_2_ns(tsc_now); /* * Compute a new multiplier as per the above comment and ensure our * time function is continuous; see the comment near struct * cyc2ns_data. */ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz, NSEC_PER_MSEC, 0); /* * cyc2ns_shift is exported via arch_perf_update_userpage() where it is * not expected to be greater than 31 due to the original published * conversion algorithm shifting a 32-bit value (now specifies a 64-bit * value) - refer perf_event_mmap_page documentation in perf_event.h. */ if (data.cyc2ns_shift == 32) { data.cyc2ns_shift = 31; data.cyc2ns_mul >>= 1; } data.cyc2ns_offset = ns_now - mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift); c2n = per_cpu_ptr(&cyc2ns, cpu); raw_write_seqcount_latch(&c2n->seq); c2n->data[0] = data; raw_write_seqcount_latch(&c2n->seq); c2n->data[1] = data; } static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long flags; local_irq_save(flags); sched_clock_idle_sleep_event(); if (khz) __set_cyc2ns_scale(khz, cpu, tsc_now); sched_clock_idle_wakeup_event(); local_irq_restore(flags); } /* * Initialize cyc2ns for boot cpu */ static void __init cyc2ns_init_boot_cpu(void) { struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); seqcount_latch_init(&c2n->seq); __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); } /* * Secondary CPUs do not run through tsc_init(), so set up * all the scale factors for all CPUs, assuming the same * speed as the bootup CPU. */ static void __init cyc2ns_init_secondary_cpus(void) { unsigned int cpu, this_cpu = smp_processor_id(); struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); struct cyc2ns_data *data = c2n->data; for_each_possible_cpu(cpu) { if (cpu != this_cpu) { seqcount_latch_init(&c2n->seq); c2n = per_cpu_ptr(&cyc2ns, cpu); c2n->data[0] = data[0]; c2n->data[1] = data[1]; } } } /* * Scheduler clock - returns current time in nanosec units. */ noinstr u64 native_sched_clock(void) { if (static_branch_likely(&__use_tsc)) { u64 tsc_now = rdtsc(); /* return the value in ns */ return __cycles_2_ns(tsc_now); } /* * Fall back to jiffies if there's no TSC available: * ( But note that we still use it if the TSC is marked * unstable. We do this because unlike Time Of Day, * the scheduler clock tolerates small errors and it's * very important for it to be as fast as the platform * can achieve it. ) */ /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); } /* * Generate a sched_clock if you already have a TSC value. 
*/ u64 native_sched_clock_from_tsc(u64 tsc) { return cycles_2_ns(tsc); } /* We need to define a real function for sched_clock, to override the weak default version */ #ifdef CONFIG_PARAVIRT noinstr u64 sched_clock_noinstr(void) { return paravirt_sched_clock(); } bool using_native_sched_clock(void) { return static_call_query(pv_sched_clock) == native_sched_clock; } #else u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock"))); bool using_native_sched_clock(void) { return true; } #endif notrace u64 sched_clock(void) { u64 now; preempt_disable_notrace(); now = sched_clock_noinstr(); preempt_enable_notrace(); return now; } int check_tsc_unstable(void) { return tsc_unstable; } EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { mark_tsc_unstable("boot parameter notsc"); return 1; } #else /* * disable flag for tsc. Takes effect by clearing the TSC cpu flag * in cpu/common.c */ int __init notsc_setup(char *str) { setup_clear_cpu_cap(X86_FEATURE_TSC); return 1; } #endif __setup("notsc", notsc_setup); static int no_sched_irq_time; static int no_tsc_watchdog; static int tsc_as_watchdog; static int __init tsc_setup(char *str) { if (!strcmp(str, "reliable")) tsc_clocksource_reliable = 1; if (!strncmp(str, "noirqtime", 9)) no_sched_irq_time = 1; if (!strcmp(str, "unstable")) mark_tsc_unstable("boot parameter"); if (!strcmp(str, "nowatchdog")) { no_tsc_watchdog = 1; if (tsc_as_watchdog) pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n", __func__); tsc_as_watchdog = 0; } if (!strcmp(str, "recalibrate")) tsc_force_recalibrate = 1; if (!strcmp(str, "watchdog")) { if (no_tsc_watchdog) pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n", __func__); else tsc_as_watchdog = 1; } return 1; } __setup("tsc=", tsc_setup); #define MAX_RETRIES 5 #define TSC_DEFAULT_THRESHOLD 0x20000 /* * Read TSC and the reference counters. Take care of any disturbances */ static u64 tsc_read_refs(u64 *p, int hpet) { u64 t1, t2; u64 thresh = tsc_khz ? tsc_khz >> 5 : TSC_DEFAULT_THRESHOLD; int i; for (i = 0; i < MAX_RETRIES; i++) { t1 = get_cycles(); if (hpet) *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; else *p = acpi_pm_read_early(); t2 = get_cycles(); if ((t2 - t1) < thresh) return t2; } return ULLONG_MAX; } /* * Calculate the TSC frequency from HPET reference */ static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2) { u64 tmp; if (hpet2 < hpet1) hpet2 += 0x100000000ULL; hpet2 -= hpet1; tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); do_div(tmp, 1000000); deltatsc = div64_u64(deltatsc, tmp); return (unsigned long) deltatsc; } /* * Calculate the TSC frequency from PMTimer reference */ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) { u64 tmp; if (!pm1 && !pm2) return ULONG_MAX; if (pm2 < pm1) pm2 += (u64)ACPI_PM_OVRRUN; pm2 -= pm1; tmp = pm2 * 1000000000LL; do_div(tmp, PMTMR_TICKS_PER_SEC); do_div(deltatsc, tmp); return (unsigned long) deltatsc; } #define CAL_MS 10 #define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS)) #define CAL_PIT_LOOPS 1000 #define CAL2_MS 50 #define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS)) #define CAL2_PIT_LOOPS 5000 /* * Try to calibrate the TSC against the Programmable * Interrupt Timer and return the frequency of the TSC * in kHz. * * Return ULONG_MAX on failure to calibrate. 
*/ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) { u64 tsc, t1, t2, delta; unsigned long tscmin, tscmax; int pitcnt; if (!has_legacy_pic()) { /* * Relies on tsc_early_delay_calibrate() to have given us semi * usable udelay(), wait for the same 50ms we would have with * the PIT loop below. */ udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); return ULONG_MAX; } /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); /* * Setup CTC channel 2* for mode 0, (interrupt on terminal * count mode), binary count. Set the latch register to 50ms * (LSB then MSB) to begin countdown. */ outb(0xb0, 0x43); outb(latch & 0xff, 0x42); outb(latch >> 8, 0x42); tsc = t1 = t2 = get_cycles(); pitcnt = 0; tscmax = 0; tscmin = ULONG_MAX; while ((inb(0x61) & 0x20) == 0) { t2 = get_cycles(); delta = t2 - tsc; tsc = t2; if ((unsigned long) delta < tscmin) tscmin = (unsigned int) delta; if ((unsigned long) delta > tscmax) tscmax = (unsigned int) delta; pitcnt++; } /* * Sanity checks: * * If we were not able to read the PIT more than loopmin * times, then we have been hit by a massive SMI * * If the maximum is 10 times larger than the minimum, * then we got hit by an SMI as well. */ if (pitcnt < loopmin || tscmax > 10 * tscmin) return ULONG_MAX; /* Calculate the PIT value */ delta = t2 - t1; do_div(delta, ms); return delta; } /* * This reads the current MSB of the PIT counter, and * checks if we are running on sufficiently fast and * non-virtualized hardware. * * Our expectations are: * * - the PIT is running at roughly 1.19MHz * * - each IO is going to take about 1us on real hardware, * but we allow it to be much faster (by a factor of 10) or * _slightly_ slower (ie we allow up to a 2us read+counter * update - anything else implies a unacceptably slow CPU * or PIT for the fast calibration to work. * * - with 256 PIT ticks to read the value, we have 214us to * see the same MSB (and overhead like doing a single TSC * read per MSB value etc). * * - We're doing 2 reads per loop (LSB, MSB), and we expect * them each to take about a microsecond on real hardware. * So we expect a count value of around 100. But we'll be * generous, and accept anything over 50. * * - if the PIT is stuck, and we see *many* more reads, we * return early (and the next caller of pit_expect_msb() * then consider it a failure when they don't see the * next expected value). * * These expectations mean that we know that we have seen the * transition from one expected value to another with a fairly * high accuracy, and we didn't miss any events. We can thus * use the TSC value at the transitions to calculate a pretty * good value for the TSC frequency. */ static inline int pit_verify_msb(unsigned char val) { /* Ignore LSB */ inb(0x42); return inb(0x42) == val; } static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { int count; u64 tsc = 0, prev_tsc = 0; for (count = 0; count < 50000; count++) { if (!pit_verify_msb(val)) break; prev_tsc = tsc; tsc = get_cycles(); } *deltap = get_cycles() - prev_tsc; *tscp = tsc; /* * We require _some_ success, but the quality control * will be based on the error terms on the TSC values. */ return count > 5; } /* * How many MSB values do we want to see? We aim for * a maximum error rate of 500ppm (in practice the * real error is much smaller), but refuse to spend * more than 50ms on it. 
*/ #define MAX_QUICK_PIT_MS 50 #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) static unsigned long quick_pit_calibrate(void) { int i; u64 tsc, delta; unsigned long d1, d2; if (!has_legacy_pic()) return 0; /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); /* * Counter 2, mode 0 (one-shot), binary count * * NOTE! Mode 2 decrements by two (and then the * output is flipped each time, giving the same * final output frequency as a decrement-by-one), * so mode 0 is much better when looking at the * individual counts. */ outb(0xb0, 0x43); /* Start at 0xffff */ outb(0xff, 0x42); outb(0xff, 0x42); /* * The PIT starts counting at the next edge, so we * need to delay for a microsecond. The easiest way * to do that is to just read back the 16-bit counter * once from the PIT. */ pit_verify_msb(0); if (pit_expect_msb(0xff, &tsc, &d1)) { for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { if (!pit_expect_msb(0xff-i, &delta, &d2)) break; delta -= tsc; /* * Extrapolate the error and fail fast if the error will * never be below 500 ppm. */ if (i == 1 && d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11) return 0; /* * Iterate until the error is less than 500 ppm */ if (d1+d2 >= delta >> 11) continue; /* * Check the PIT one more time to verify that * all TSC reads were stable wrt the PIT. * * This also guarantees serialization of the * last cycle read ('d2') in pit_expect_msb. */ if (!pit_verify_msb(0xfe - i)) break; goto success; } } pr_info("Fast TSC calibration failed\n"); return 0; success: /* * Ok, if we get here, then we've seen the * MSB of the PIT decrement 'i' times, and the * error has shrunk to less than 500 ppm. * * As a result, we can depend on there not being * any odd delays anywhere, and the TSC reads are * reliable (within the error). * * kHz = ticks / time-in-seconds / 1000; * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000) */ delta *= PIT_TICK_RATE; do_div(delta, i*256*1000); pr_info("Fast TSC calibration using PIT\n"); return delta; } /** * native_calibrate_tsc * Determine TSC frequency via CPUID, else return 0. */ unsigned long native_calibrate_tsc(void) { unsigned int eax_denominator, ebx_numerator, ecx_hz, edx; unsigned int crystal_khz; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; if (boot_cpu_data.cpuid_level < 0x15) return 0; eax_denominator = ebx_numerator = ecx_hz = edx = 0; /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); if (ebx_numerator == 0 || eax_denominator == 0) return 0; crystal_khz = ecx_hz / 1000; /* * Denverton SoCs don't report crystal clock, and also don't support * CPUID.0x16 for the calculation below, so hardcode the 25MHz crystal * clock. */ if (crystal_khz == 0 && boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_D) crystal_khz = 25000; /* * TSC frequency reported directly by CPUID is a "hardware reported" * frequency and is the most accurate one so far we have. This * is considered a known frequency. */ if (crystal_khz != 0) setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); /* * Some Intel SoCs like Skylake and Kabylake don't report the crystal * clock, but we can easily calculate it to a high degree of accuracy * by considering the crystal ratio and the CPU speed. 
*/ if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= 0x16) { unsigned int eax_base_mhz, ebx, ecx, edx; cpuid(0x16, &eax_base_mhz, &ebx, &ecx, &edx); crystal_khz = eax_base_mhz * 1000 * eax_denominator / ebx_numerator; } if (crystal_khz == 0) return 0; /* * For Atom SoCs TSC is the only reliable clocksource. * Mark TSC reliable so no watchdog on it. */ if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); #ifdef CONFIG_X86_LOCAL_APIC /* * The local APIC appears to be fed by the core crystal clock * (which sounds entirely sensible). We can set the global * lapic_timer_period here to avoid having to calibrate the APIC * timer later. */ lapic_timer_period = crystal_khz * 1000 / HZ; #endif return crystal_khz * ebx_numerator / eax_denominator; } static unsigned long cpu_khz_from_cpuid(void) { unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; if (boot_cpu_data.cpuid_level < 0x16) return 0; eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0; cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); return eax_base_mhz * 1000; } /* * calibrate cpu using pit, hpet, and ptimer methods. They are available * later in boot after acpi is initialized. */ static unsigned long pit_hpet_ptimer_calibrate_cpu(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; unsigned long flags, latch, ms; int hpet = is_hpet_enabled(), i, loopmin; /* * Run 5 calibration loops to get the lowest frequency value * (the best estimate). We use two different calibration modes * here: * * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and * load a timeout of 50ms. We read the time right after we * started the timer and wait until the PIT count down reaches * zero. In each wait loop iteration we read the TSC and check * the delta to the previous read. We keep track of the min * and max values of that delta. The delta is mostly defined * by the IO time of the PIT access, so we can detect when * any disturbance happened between the two reads. If the * maximum time is significantly larger than the minimum time, * then we discard the result and have another try. * * 2) Reference counter. If available we use the HPET or the * PMTIMER as a reference to check the sanity of that value. * We use separate TSC readouts and check inside of the * reference read for any possible disturbance. We discard * disturbed values here as well. We do that around the PIT * calibration delay loop as we have to wait for a certain * amount of time anyway. */ /* Preset PIT loop values */ latch = CAL_LATCH; ms = CAL_MS; loopmin = CAL_PIT_LOOPS; for (i = 0; i < 3; i++) { unsigned long tsc_pit_khz; /* * Read the start value and the reference count of * hpet/pmtimer when available. Then do the PIT * calibration, which will take at least 50ms, and * read the end value. */ local_irq_save(flags); tsc1 = tsc_read_refs(&ref1, hpet); tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin); tsc2 = tsc_read_refs(&ref2, hpet); local_irq_restore(flags); /* Pick the lowest PIT TSC calibration so far */ tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); /* hpet or pmtimer available ? 
*/ if (ref1 == ref2) continue; /* Check, whether the sampling was disturbed */ if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) continue; tsc2 = (tsc2 - tsc1) * 1000000LL; if (hpet) tsc2 = calc_hpet_ref(tsc2, ref1, ref2); else tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2); tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); /* Check the reference deviation */ delta = ((u64) tsc_pit_min) * 100; do_div(delta, tsc_ref_min); /* * If both calibration results are inside a 10% window * then we can be sure, that the calibration * succeeded. We break out of the loop right away. We * use the reference value, as it is more precise. */ if (delta >= 90 && delta <= 110) { pr_info("PIT calibration matches %s. %d loops\n", hpet ? "HPET" : "PMTIMER", i + 1); return tsc_ref_min; } /* * Check whether PIT failed more than once. This * happens in virtualized environments. We need to * give the virtual PC a slightly longer timeframe for * the HPET/PMTIMER to make the result precise. */ if (i == 1 && tsc_pit_min == ULONG_MAX) { latch = CAL2_LATCH; ms = CAL2_MS; loopmin = CAL2_PIT_LOOPS; } } /* * Now check the results. */ if (tsc_pit_min == ULONG_MAX) { /* PIT gave no useful value */ pr_warn("Unable to calibrate against PIT\n"); /* We don't have an alternative source, disable TSC */ if (!hpet && !ref1 && !ref2) { pr_notice("No reference (HPET/PMTIMER) available\n"); return 0; } /* The alternative source failed as well, disable TSC */ if (tsc_ref_min == ULONG_MAX) { pr_warn("HPET/PMTIMER calibration failed\n"); return 0; } /* Use the alternative source */ pr_info("using %s reference calibration\n", hpet ? "HPET" : "PMTIMER"); return tsc_ref_min; } /* We don't have an alternative source, use the PIT calibration value */ if (!hpet && !ref1 && !ref2) { pr_info("Using PIT calibration value\n"); return tsc_pit_min; } /* The alternative source failed, use the PIT calibration value */ if (tsc_ref_min == ULONG_MAX) { pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n"); return tsc_pit_min; } /* * The calibration values differ too much. In doubt, we use * the PIT value as we know that there are PMTIMERs around * running at double speed. At least we let the user know: */ pr_warn("PIT calibration deviates from %s: %lu %lu\n", hpet ? 
"HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); pr_info("Using PIT calibration value\n"); return tsc_pit_min; } /** * native_calibrate_cpu_early - can calibrate the cpu early in boot */ unsigned long native_calibrate_cpu_early(void) { unsigned long flags, fast_calibrate = cpu_khz_from_cpuid(); if (!fast_calibrate) fast_calibrate = cpu_khz_from_msr(); if (!fast_calibrate) { local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); } return fast_calibrate; } /** * native_calibrate_cpu - calibrate the cpu */ static unsigned long native_calibrate_cpu(void) { unsigned long tsc_freq = native_calibrate_cpu_early(); if (!tsc_freq) tsc_freq = pit_hpet_ptimer_calibrate_cpu(); return tsc_freq; } void recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP unsigned long cpu_khz_old = cpu_khz; if (!boot_cpu_has(X86_FEATURE_TSC)) return; cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); if (tsc_khz == 0) tsc_khz = cpu_khz; else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, cpu_khz_old, cpu_khz); #endif } EXPORT_SYMBOL_GPL(recalibrate_cpu_khz); static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) { if (!sched_clock_stable()) return; cyc2ns_suspend = sched_clock(); } /* * Even on processors with invariant TSC, TSC gets reset in some the * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to * arbitrary value (still sync'd across cpu's) during resume from such sleep * states. To cope up with this, recompute the cyc2ns_offset for each cpu so * that sched_clock() continues from the point where it was left off during * suspend. */ void tsc_restore_sched_clock_state(void) { unsigned long long offset; unsigned long flags; int cpu; if (!sched_clock_stable()) return; local_irq_save(flags); /* * We're coming out of suspend, there's no concurrency yet; don't * bother being nice about the RCU stuff, just write to both * data fields. */ this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); offset = cyc2ns_suspend - sched_clock(); for_each_possible_cpu(cpu) { per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; } local_irq_restore(flags); } #ifdef CONFIG_CPU_FREQ /* * Frequency scaling support. Adjust the TSC based timer when the CPU frequency * changes. * * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC * as unstable and give up in those cases. * * Should fix up last_tsc too. Currently gettimeofday in the * first tick after the change will be slightly wrong. 
*/ static unsigned int ref_freq; static unsigned long loops_per_jiffy_ref; static unsigned long tsc_khz_ref; static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; if (num_online_cpus() > 1) { mark_tsc_unstable("cpufreq changes on SMP"); return 0; } if (!ref_freq) { ref_freq = freq->old; loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy; tsc_khz_ref = tsc_khz; } if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { boot_cpu_data.loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) mark_tsc_unstable("cpufreq changes"); set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc()); } return 0; } static struct notifier_block time_cpufreq_notifier_block = { .notifier_call = time_cpufreq_notifier }; static int __init cpufreq_register_tsc_scaling(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) return 0; if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; cpufreq_register_notifier(&time_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); return 0; } core_initcall(cpufreq_register_tsc_scaling); #endif /* CONFIG_CPU_FREQ */ #define ART_CPUID_LEAF (0x15) #define ART_MIN_DENOMINATOR (1) /* * If ART is present detect the numerator:denominator to convert to TSC */ static void __init detect_art(void) { unsigned int unused[2]; if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) return; /* * Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required, * and the TSC counter resets must not occur asynchronously. */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || !boot_cpu_has(X86_FEATURE_TSC_ADJUST) || tsc_async_resets) return; cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, &art_to_tsc_numerator, unused, unused+1); if (art_to_tsc_denominator < ART_MIN_DENOMINATOR) return; rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset); /* Make this sticky over multiple CPU init calls */ setup_force_cpu_cap(X86_FEATURE_ART); } /* clocksource code */ static void tsc_resume(struct clocksource *cs) { tsc_verify_tsc_adjust(true); } /* * We used to compare the TSC to the cycle_last value in the clocksource * structure to avoid a nasty time-warp. This can be observed in a * very small window right after one CPU updated cycle_last under * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which * is smaller than the cycle_last reference value due to a TSC which * is slightly behind. This delta is nowhere else observable, but in * that case it results in a forward time jump in the range of hours * due to the unsigned delta calculation of the time keeping core * code, which is necessary to support wrapping clocksources like pm * timer. * * This sanity check is now done in the core timekeeping code. * checking the result of read_tsc() - cycle_last for being negative. * That works because CLOCKSOURCE_MASK(64) does not mask out any bit. 
*/ static u64 read_tsc(struct clocksource *cs) { return (u64)rdtsc_ordered(); } static void tsc_cs_mark_unstable(struct clocksource *cs) { if (tsc_unstable) return; tsc_unstable = 1; if (using_native_sched_clock()) clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to clocksource watchdog\n"); } static void tsc_cs_tick_stable(struct clocksource *cs) { if (tsc_unstable) return; if (using_native_sched_clock()) sched_clock_tick_stable(); } static int tsc_cs_enable(struct clocksource *cs) { vclocks_set_used(VDSO_CLOCKMODE_TSC); return 0; } /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ static struct clocksource clocksource_tsc_early = { .name = "tsc-early", .rating = 299, .uncertainty_margin = 32 * NSEC_PER_MSEC, .read = read_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, .list = LIST_HEAD_INIT(clocksource_tsc_early.list), }; /* * Must mark VALID_FOR_HRES early such that when we unregister tsc_early * this one will immediately take over. We will only register if TSC has * been found good. */ static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_MUST_VERIFY | CLOCK_SOURCE_VERIFY_PERCPU, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, .list = LIST_HEAD_INIT(clocksource_tsc.list), }; void mark_tsc_unstable(char *reason) { if (tsc_unstable) return; tsc_unstable = 1; if (using_native_sched_clock()) clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to %s\n", reason); clocksource_mark_unstable(&clocksource_tsc_early); clocksource_mark_unstable(&clocksource_tsc); } EXPORT_SYMBOL_GPL(mark_tsc_unstable); static void __init tsc_disable_clocksource_watchdog(void) { clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; } bool tsc_clocksource_watchdog_disabled(void) { return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) && tsc_as_watchdog && !no_tsc_watchdog; } static void __init check_system_tsc_reliable(void) { #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC) if (is_geode_lx()) { /* RTSC counts during suspend */ #define RTSC_SUSP 0x100 unsigned long res_low, res_high; rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); /* Geode_LX - the OLPC CPU has a very reliable TSC */ if (res_low & RTSC_SUSP) tsc_clocksource_reliable = 1; } #endif if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) tsc_clocksource_reliable = 1; /* * Disable the clocksource watchdog when the system has: * - TSC running at constant frequency * - TSC which does not stop in C-States * - the TSC_ADJUST register which allows to detect even minimal * modifications * - not more than two sockets. As the number of sockets cannot be * evaluated at the early boot stage where this has to be * invoked, check the number of online memory nodes as a * fallback solution which is an reasonable estimate. 
*/ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && boot_cpu_has(X86_FEATURE_TSC_ADJUST) && nr_online_nodes <= 4) tsc_disable_clocksource_watchdog(); } /* * Make an educated guess if the TSC is trustworthy and synchronized * over all CPUs. */ int unsynchronized_tsc(void) { if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable) return 1; #ifdef CONFIG_SMP if (apic_is_clustered_box()) return 1; #endif if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; if (tsc_clocksource_reliable) return 0; /* * Intel systems are normally all synchronized. * Exceptions must mark TSC as unstable: */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { /* assume multi socket systems are not synchronized: */ if (num_possible_cpus() > 1) return 1; } return 0; } /* * Convert ART to TSC given numerator/denominator found in detect_art() */ struct system_counterval_t convert_art_to_tsc(u64 art) { u64 tmp, res, rem; rem = do_div(art, art_to_tsc_denominator); res = art * art_to_tsc_numerator; tmp = rem * art_to_tsc_numerator; do_div(tmp, art_to_tsc_denominator); res += tmp + art_to_tsc_offset; return (struct system_counterval_t) {.cs = art_related_clocksource, .cycles = res}; } EXPORT_SYMBOL(convert_art_to_tsc); /** * convert_art_ns_to_tsc() - Convert ART in nanoseconds to TSC. * @art_ns: ART (Always Running Timer) in unit of nanoseconds * * PTM requires all timestamps to be in units of nanoseconds. When user * software requests a cross-timestamp, this function converts system timestamp * to TSC. * * This is valid when CPU feature flag X86_FEATURE_TSC_KNOWN_FREQ is set * indicating the tsc_khz is derived from CPUID[15H]. Drivers should check * that this flag is set before conversion to TSC is attempted. * * Return: * struct system_counterval_t - system counter value with the pointer to the * corresponding clocksource * @cycles: System counter value * @cs: Clocksource corresponding to system counter value. Used * by timekeeping code to verify comparability of two cycle * values. */ struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns) { u64 tmp, res, rem; rem = do_div(art_ns, USEC_PER_SEC); res = art_ns * tsc_khz; tmp = rem * tsc_khz; do_div(tmp, USEC_PER_SEC); res += tmp; return (struct system_counterval_t) { .cs = art_related_clocksource, .cycles = res}; } EXPORT_SYMBOL(convert_art_ns_to_tsc); static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); /** * tsc_refine_calibration_work - Further refine tsc freq calibration * @work - ignored. * * This functions uses delayed work over a period of a * second to further refine the TSC freq value. Since this is * timer based, instead of loop based, we don't block the boot * process while this longer calibration is done. * * If there are any calibration anomalies (too many SMIs, etc), * or the refined calibration is off by 1% of the fast early * calibration, we throw out the new calibration and use the * early calibration. */ static void tsc_refine_calibration_work(struct work_struct *work) { static u64 tsc_start = ULLONG_MAX, ref_start; static int hpet; u64 tsc_stop, ref_stop, delta; unsigned long freq; int cpu; /* Don't bother refining TSC on unstable systems */ if (tsc_unstable) goto unreg; /* * Since the work is started early in boot, we may be * delayed the first time we expire. So set the workqueue * again once we know timers are working. 
*/ if (tsc_start == ULLONG_MAX) { restart: /* * Only set hpet once, to avoid mixing hardware * if the hpet becomes enabled later. */ hpet = is_hpet_enabled(); tsc_start = tsc_read_refs(&ref_start, hpet); schedule_delayed_work(&tsc_irqwork, HZ); return; } tsc_stop = tsc_read_refs(&ref_stop, hpet); /* hpet or pmtimer available ? */ if (ref_start == ref_stop) goto out; /* Check, whether the sampling was disturbed */ if (tsc_stop == ULLONG_MAX) goto restart; delta = tsc_stop - tsc_start; delta *= 1000000LL; if (hpet) freq = calc_hpet_ref(delta, ref_start, ref_stop); else freq = calc_pmtimer_ref(delta, ref_start, ref_stop); /* Will hit this only if tsc_force_recalibrate has been set */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { /* Warn if the deviation exceeds 500 ppm */ if (abs(tsc_khz - freq) > (tsc_khz >> 11)) { pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n"); pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n", (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); } pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n", hpet ? "HPET" : "PM_TIMER", (unsigned long)freq / 1000, (unsigned long)freq % 1000); return; } /* Make sure we're within 1% */ if (abs(tsc_khz - freq) > tsc_khz/100) goto out; tsc_khz = freq; pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n", (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); /* Inform the TSC deadline clockevent devices about the recalibration */ lapic_update_tsc_freq(); /* Update the sched_clock() rate to match the clocksource one */ for_each_possible_cpu(cpu) set_cyc2ns_scale(tsc_khz, cpu, tsc_stop); out: if (tsc_unstable) goto unreg; if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); unreg: clocksource_unregister(&clocksource_tsc_early); } static int __init init_tsc_clocksource(void) { if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz) return 0; if (tsc_unstable) { clocksource_unregister(&clocksource_tsc_early); return 0; } if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; /* * When TSC frequency is known (retrieved via MSR or CPUID), we skip * the refined calibration and directly register it as a clocksource. */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); clocksource_unregister(&clocksource_tsc_early); if (!tsc_force_recalibrate) return 0; } schedule_delayed_work(&tsc_irqwork, 0); return 0; } /* * We use device_initcall here, to ensure we run after the hpet * is fully initialized, which may occur at fs_initcall time. */ device_initcall(init_tsc_clocksource); static bool __init determine_cpu_tsc_frequencies(bool early) { /* Make sure that cpu and tsc are not already calibrated */ WARN_ON(cpu_khz || tsc_khz); if (early) { cpu_khz = x86_platform.calibrate_cpu(); if (tsc_early_khz) tsc_khz = tsc_early_khz; else tsc_khz = x86_platform.calibrate_tsc(); } else { /* We should not be here with non-native cpu calibration */ WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); cpu_khz = pit_hpet_ptimer_calibrate_cpu(); } /* * Trust non-zero tsc_khz as authoritative, * and use it to sanity check cpu_khz, * which will be off if system timer is off. 
*/ if (tsc_khz == 0) tsc_khz = cpu_khz; else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; if (tsc_khz == 0) return false; pr_info("Detected %lu.%03lu MHz processor\n", (unsigned long)cpu_khz / KHZ, (unsigned long)cpu_khz % KHZ); if (cpu_khz != tsc_khz) { pr_info("Detected %lu.%03lu MHz TSC", (unsigned long)tsc_khz / KHZ, (unsigned long)tsc_khz % KHZ); } return true; } static unsigned long __init get_loops_per_jiffy(void) { u64 lpj = (u64)tsc_khz * KHZ; do_div(lpj, HZ); return lpj; } static void __init tsc_enable_sched_clock(void) { loops_per_jiffy = get_loops_per_jiffy(); use_tsc_delay(); /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); cyc2ns_init_boot_cpu(); static_branch_enable(&__use_tsc); } void __init tsc_early_init(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) return; /* Don't change UV TSC multi-chassis synchronization */ if (is_early_uv_system()) return; if (!determine_cpu_tsc_frequencies(true)) return; tsc_enable_sched_clock(); } void __init tsc_init(void) { if (!cpu_feature_enabled(X86_FEATURE_TSC)) { setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } /* * native_calibrate_cpu_early can only calibrate using methods that are * available early in boot. */ if (x86_platform.calibrate_cpu == native_calibrate_cpu_early) x86_platform.calibrate_cpu = native_calibrate_cpu; if (!tsc_khz) { /* We failed to determine frequencies earlier, try again */ if (!determine_cpu_tsc_frequencies(false)) { mark_tsc_unstable("could not calculate TSC khz"); setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } tsc_enable_sched_clock(); } cyc2ns_init_secondary_cpus(); if (!no_sched_irq_time) enable_sched_clock_irqtime(); lpj_fine = get_loops_per_jiffy(); check_system_tsc_reliable(); if (unsynchronized_tsc()) { mark_tsc_unstable("TSCs unsynchronized"); return; } if (tsc_clocksource_reliable || no_tsc_watchdog) tsc_disable_clocksource_watchdog(); clocksource_register_khz(&clocksource_tsc_early, tsc_khz); detect_art(); } #ifdef CONFIG_SMP /* * Check whether existing calibration data can be reused. */ unsigned long calibrate_delay_is_known(void) { int sibling, cpu = smp_processor_id(); int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); const struct cpumask *mask = topology_core_cpumask(cpu); /* * If TSC has constant frequency and TSC is synchronized across * sockets then reuse CPU0 calibration. */ if (constant_tsc && !tsc_unstable) return cpu_data(0).loops_per_jiffy; /* * If TSC has constant frequency and TSC is not synchronized across * sockets and this is not the first CPU in the socket, then reuse * the calibration value of an already online CPU on that socket. * * This assumes that CONSTANT_TSC is consistent for all CPUs in a * socket. */ if (!constant_tsc || !mask) return 0; sibling = cpumask_any_but(mask, cpu); if (sibling < nr_cpu_ids) return cpu_data(sibling).loops_per_jiffy; return 0; } #endif
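/*
 * A minimal user-space sketch (illustrative only, not kernel code) of the
 * quotient/remainder split that convert_art_ns_to_tsc() above uses so the
 * intermediate products of ns * tsc_khz / USEC_PER_SEC stay within 64 bits
 * for realistic inputs. The helper name ns_to_tsc_cycles() and the assumed
 * 2500000 kHz frequency are made up for the example.
 */
#include <stdio.h>
#include <stdint.h>

#define EX_USEC_PER_SEC	1000000ULL

static uint64_t ns_to_tsc_cycles(uint64_t ns, uint64_t tsc_khz)
{
	/* Mirrors: rem = do_div(art_ns, USEC_PER_SEC); res = art_ns * tsc_khz; ... */
	uint64_t quot = ns / EX_USEC_PER_SEC;
	uint64_t rem  = ns % EX_USEC_PER_SEC;

	return quot * tsc_khz + (rem * tsc_khz) / EX_USEC_PER_SEC;
}

int main(void)
{
	/* one second of nanoseconds at an assumed 2.5 GHz TSC -> 2500000000 cycles */
	printf("%llu\n",
	       (unsigned long long)ns_to_tsc_cycles(1000000000ULL, 2500000ULL));
	return 0;
}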
7 9 1547 1547 9 9 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * SHA1 Secure Hash Algorithm. * * Derived from cryptoapi implementation, adapted for in-place * scatterlist interface. * * Copyright (c) Alan Smithee. * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> */ #include <crypto/internal/hash.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> #include <crypto/sha1.h> #include <crypto/sha1_base.h> #include <asm/byteorder.h> const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE] = { 0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d, 0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90, 0xaf, 0xd8, 0x07, 0x09 }; EXPORT_SYMBOL_GPL(sha1_zero_message_hash); static void sha1_generic_block_fn(struct sha1_state *sst, u8 const *src, int blocks) { u32 temp[SHA1_WORKSPACE_WORDS]; while (blocks--) { sha1_transform(sst->state, src, temp); src += SHA1_BLOCK_SIZE; } memzero_explicit(temp, sizeof(temp)); } int crypto_sha1_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha1_base_do_update(desc, data, len, sha1_generic_block_fn); } EXPORT_SYMBOL(crypto_sha1_update); static int sha1_final(struct shash_desc *desc, u8 *out) { sha1_base_do_finalize(desc, sha1_generic_block_fn); return sha1_base_finish(desc, out); } int crypto_sha1_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { sha1_base_do_update(desc, data, len, sha1_generic_block_fn); return sha1_final(desc, out); } EXPORT_SYMBOL(crypto_sha1_finup); static struct shash_alg alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = crypto_sha1_update, .final = sha1_final, .finup = crypto_sha1_finup, .descsize = sizeof(struct sha1_state), .base = { .cra_name = "sha1", .cra_driver_name= "sha1-generic", .cra_priority = 100, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } }; static int __init sha1_generic_mod_init(void) { return crypto_register_shash(&alg); } static void __exit sha1_generic_mod_fini(void) { crypto_unregister_shash(&alg); } subsys_initcall(sha1_generic_mod_init); module_exit(sha1_generic_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm"); MODULE_ALIAS_CRYPTO("sha1"); MODULE_ALIAS_CRYPTO("sha1-generic");
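/*
 * A minimal sketch (illustrative only, error paths trimmed) of how a kernel
 * caller might reach the shash algorithm registered above through the
 * generic crypto API. The function name sha1_digest_example() is a
 * hypothetical helper, not part of this file.
 */
#include <crypto/hash.h>
#include <crypto/sha1.h>
#include <linux/err.h>

static int __maybe_unused sha1_digest_example(const u8 *data, unsigned int len,
					      u8 out[SHA1_DIGEST_SIZE])
{
	struct crypto_shash *tfm;
	int err;

	tfm = crypto_alloc_shash("sha1", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		/* shash_desc lives on the stack; only ->tfm needs to be set */
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		err = crypto_shash_digest(desc, data, len, out);
	}

	crypto_free_shash(tfm);
	return err;
}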
4 4 4 2 1 1 1 3 2 1 1 1 1 1 28 28 27 5 18 18 17 1 19 19 3 3 1 3 3 1 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 // SPDX-License-Identifier: GPL-2.0-only /* * truncate.c * * PURPOSE * Truncate handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT * (C) 1999-2004 Ben Fennema * (C) 1999 Stelias Computing Inc * * HISTORY * * 02/24/99 blf Created. * */ #include "udfdecl.h" #include <linux/fs.h> #include <linux/mm.h> #include "udf_i.h" #include "udf_sb.h" static void extent_trunc(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen, uint32_t nelen) { struct kernel_lb_addr neloc = {}; int last_block = (elen + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; int first_block = (nelen + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (nelen) { if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { udf_free_blocks(inode->i_sb, inode, eloc, 0, last_block); etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30); } else neloc = *eloc; nelen = (etype << 30) | nelen; } if (elen != nelen) { udf_write_aext(inode, epos, &neloc, nelen, 0); if (last_block > first_block) { if (etype == (EXT_RECORDED_ALLOCATED >> 30)) mark_inode_dirty(inode); if (etype != (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) udf_free_blocks(inode->i_sb, inode, eloc, first_block, last_block - first_block); } } } /* * Truncate the last extent to match i_size. This function assumes * that preallocation extent is already truncated. */ void udf_truncate_tail_extent(struct inode *inode) { struct extent_position epos = {}; struct kernel_lb_addr eloc; uint32_t elen, nelen; uint64_t lbcount = 0; int8_t etype = -1, netype; int adsize; struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || inode->i_size == iinfo->i_lenExtents) return; /* Are we going to delete the file anyway? 
*/ if (inode->i_nlink == 0) return; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else BUG(); /* Find the last extent in the file */ while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { etype = netype; lbcount += elen; if (lbcount > inode->i_size) { if (lbcount - inode->i_size >= inode->i_sb->s_blocksize) udf_warn(inode->i_sb, "Too long extent after EOF in inode %u: i_size: %lld lbcount: %lld extent %u+%u\n", (unsigned)inode->i_ino, (long long)inode->i_size, (long long)lbcount, (unsigned)eloc.logicalBlockNum, (unsigned)elen); nelen = elen - (lbcount - inode->i_size); epos.offset -= adsize; extent_trunc(inode, &epos, &eloc, etype, elen, nelen); epos.offset += adsize; if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) udf_err(inode->i_sb, "Extent after EOF in inode %u\n", (unsigned)inode->i_ino); break; } } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); } void udf_discard_prealloc(struct inode *inode) { struct extent_position epos = {}; struct extent_position prev_epos = {}; struct kernel_lb_addr eloc; uint32_t elen; uint64_t lbcount = 0; int8_t etype = -1; struct udf_inode_info *iinfo = UDF_I(inode); int bsize = i_blocksize(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || ALIGN(inode->i_size, bsize) == ALIGN(iinfo->i_lenExtents, bsize)) return; epos.block = iinfo->i_location; /* Find the last extent in the file */ while (udf_next_aext(inode, &epos, &eloc, &elen, 0) != -1) { brelse(prev_epos.bh); prev_epos = epos; if (prev_epos.bh) get_bh(prev_epos.bh); etype = udf_next_aext(inode, &epos, &eloc, &elen, 1); lbcount += elen; } if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { lbcount -= elen; udf_delete_aext(inode, prev_epos); udf_free_blocks(inode->i_sb, inode, &eloc, 0, DIV_ROUND_UP(elen, bsize)); } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ iinfo->i_lenExtents = lbcount; brelse(epos.bh); brelse(prev_epos.bh); } static void udf_update_alloc_ext_desc(struct inode *inode, struct extent_position *epos, u32 lenalloc) { struct super_block *sb = inode->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); struct allocExtDesc *aed = (struct allocExtDesc *) (epos->bh->b_data); int len = sizeof(struct allocExtDesc); aed->lengthAllocDescs = cpu_to_le32(lenalloc); if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) || sbi->s_udfrev >= 0x0201) len += lenalloc; udf_update_tag(epos->bh->b_data, len); mark_buffer_dirty_inode(epos->bh, inode); } /* * Truncate extents of inode to inode->i_size. This function can be used only * for making file shorter. For making file longer, udf_extend_file() has to * be used. 
*/ int udf_truncate_extents(struct inode *inode) { struct extent_position epos; struct kernel_lb_addr eloc, neloc = {}; uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc; int8_t etype; struct super_block *sb = inode->i_sb; sector_t first_block = inode->i_size >> sb->s_blocksize_bits, offset; loff_t byte_offset; int adsize; struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else BUG(); etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); byte_offset = (offset << sb->s_blocksize_bits) + (inode->i_size & (sb->s_blocksize - 1)); if (etype == -1) { /* We should extend the file? */ WARN_ON(byte_offset); return 0; } epos.offset -= adsize; extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); epos.offset += adsize; if (byte_offset) lenalloc = epos.offset; else lenalloc = epos.offset - adsize; if (!epos.bh) lenalloc -= udf_file_entry_alloc_offset(inode); else lenalloc -= sizeof(struct allocExtDesc); while ((etype = udf_current_aext(inode, &epos, &eloc, &elen, 0)) != -1) { if (etype == (EXT_NEXT_EXTENT_ALLOCDESCS >> 30)) { udf_write_aext(inode, &epos, &neloc, nelen, 0); if (indirect_ext_len) { /* We managed to free all extents in the * indirect extent - free it too */ BUG_ON(!epos.bh); udf_free_blocks(sb, NULL, &epos.block, 0, indirect_ext_len); } else if (!epos.bh) { iinfo->i_lenAlloc = lenalloc; mark_inode_dirty(inode); } else udf_update_alloc_ext_desc(inode, &epos, lenalloc); brelse(epos.bh); epos.offset = sizeof(struct allocExtDesc); epos.block = eloc; epos.bh = sb_bread(sb, udf_get_lb_pblock(sb, &eloc, 0)); /* Error reading indirect block? */ if (!epos.bh) return -EIO; if (elen) indirect_ext_len = (elen + sb->s_blocksize - 1) >> sb->s_blocksize_bits; else indirect_ext_len = 1; } else { extent_trunc(inode, &epos, &eloc, etype, elen, 0); epos.offset += adsize; } } if (indirect_ext_len) { BUG_ON(!epos.bh); udf_free_blocks(sb, NULL, &epos.block, 0, indirect_ext_len); } else if (!epos.bh) { iinfo->i_lenAlloc = lenalloc; mark_inode_dirty(inode); } else udf_update_alloc_ext_desc(inode, &epos, lenalloc); iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); return 0; }
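/*
 * An illustrative user-space sketch (not part of the driver) of the extent
 * length encoding that extent_trunc() above relies on: the two high bits of
 * the 32-bit length word carry the extent type (the EXT_* constants shifted
 * right by 30), and the low 30 bits carry the length in bytes. The EX_*
 * macro names are made up for the example.
 */
#include <stdio.h>
#include <stdint.h>

#define EX_EXT_TYPE(len)	((len) >> 30)
#define EX_EXT_LEN(len)		((len) & 0x3fffffffU)
#define EX_EXT_PACK(type, len)	(((uint32_t)(type) << 30) | (len))

int main(void)
{
	uint32_t elen = EX_EXT_PACK(1, 8192);	/* type 1, 8192 bytes */

	printf("type=%u len=%u\n", EX_EXT_TYPE(elen), EX_EXT_LEN(elen));
	return 0;
}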
42 40 2 34 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 // SPDX-License-Identifier: GPL-2.0-only /* loopback transport for vsock using virtio_transport_common APIs * * Copyright (C) 2013-2019 Red Hat, Inc. * Authors: Asias He <asias@redhat.com> * Stefan Hajnoczi <stefanha@redhat.com> * Stefano Garzarella <sgarzare@redhat.com> * */ #include <linux/spinlock.h> #include <linux/module.h> #include <linux/list.h> #include <linux/virtio_vsock.h> struct vsock_loopback { struct workqueue_struct *workqueue; struct sk_buff_head pkt_queue; struct work_struct pkt_work; }; static struct vsock_loopback the_vsock_loopback; static u32 vsock_loopback_get_local_cid(void) { return VMADDR_CID_LOCAL; } static int vsock_loopback_send_pkt(struct sk_buff *skb) { struct vsock_loopback *vsock = &the_vsock_loopback; int len = skb->len; virtio_vsock_skb_queue_tail(&vsock->pkt_queue, skb); queue_work(vsock->workqueue, &vsock->pkt_work); return len; } static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk) { struct vsock_loopback *vsock = &the_vsock_loopback; virtio_transport_purge_skbs(vsk, &vsock->pkt_queue); return 0; } static bool vsock_loopback_seqpacket_allow(u32 remote_cid); static bool vsock_loopback_msgzerocopy_allow(void) { return true; } static struct virtio_transport loopback_transport = { .transport = { .module = THIS_MODULE, .get_local_cid = vsock_loopback_get_local_cid, .init = virtio_transport_do_socket_init, .destruct = virtio_transport_destruct, .release = virtio_transport_release, .connect = virtio_transport_connect, .shutdown = virtio_transport_shutdown, .cancel_pkt = vsock_loopback_cancel_pkt, .dgram_bind = virtio_transport_dgram_bind, .dgram_dequeue = virtio_transport_dgram_dequeue, .dgram_enqueue = virtio_transport_dgram_enqueue, .dgram_allow = virtio_transport_dgram_allow, .stream_dequeue = virtio_transport_stream_dequeue, .stream_enqueue = virtio_transport_stream_enqueue, .stream_has_data = virtio_transport_stream_has_data, .stream_has_space = virtio_transport_stream_has_space, .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, .stream_is_active = virtio_transport_stream_is_active, .stream_allow = virtio_transport_stream_allow, .seqpacket_dequeue = virtio_transport_seqpacket_dequeue, .seqpacket_enqueue = virtio_transport_seqpacket_enqueue, .seqpacket_allow = vsock_loopback_seqpacket_allow, .seqpacket_has_data = virtio_transport_seqpacket_has_data, .msgzerocopy_allow = vsock_loopback_msgzerocopy_allow, .notify_poll_in = virtio_transport_notify_poll_in, .notify_poll_out = virtio_transport_notify_poll_out, .notify_recv_init = virtio_transport_notify_recv_init, .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, .notify_send_init = virtio_transport_notify_send_init, .notify_send_pre_block = virtio_transport_notify_send_pre_block, .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, 
.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, .notify_buffer_size = virtio_transport_notify_buffer_size, .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, .read_skb = virtio_transport_read_skb, }, .send_pkt = vsock_loopback_send_pkt, }; static bool vsock_loopback_seqpacket_allow(u32 remote_cid) { return true; } static void vsock_loopback_work(struct work_struct *work) { struct vsock_loopback *vsock = container_of(work, struct vsock_loopback, pkt_work); struct sk_buff_head pkts; struct sk_buff *skb; skb_queue_head_init(&pkts); spin_lock_bh(&vsock->pkt_queue.lock); skb_queue_splice_init(&vsock->pkt_queue, &pkts); spin_unlock_bh(&vsock->pkt_queue.lock); while ((skb = __skb_dequeue(&pkts))) { virtio_transport_deliver_tap_pkt(skb); virtio_transport_recv_pkt(&loopback_transport, skb); } } static int __init vsock_loopback_init(void) { struct vsock_loopback *vsock = &the_vsock_loopback; int ret; vsock->workqueue = alloc_workqueue("vsock-loopback", 0, 0); if (!vsock->workqueue) return -ENOMEM; skb_queue_head_init(&vsock->pkt_queue); INIT_WORK(&vsock->pkt_work, vsock_loopback_work); ret = vsock_core_register(&loopback_transport.transport, VSOCK_TRANSPORT_F_LOCAL); if (ret) goto out_wq; return 0; out_wq: destroy_workqueue(vsock->workqueue); return ret; } static void __exit vsock_loopback_exit(void) { struct vsock_loopback *vsock = &the_vsock_loopback; vsock_core_unregister(&loopback_transport.transport); flush_work(&vsock->pkt_work); virtio_vsock_skb_queue_purge(&vsock->pkt_queue); destroy_workqueue(vsock->workqueue); } module_init(vsock_loopback_init); module_exit(vsock_loopback_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Stefano Garzarella <sgarzare@redhat.com>"); MODULE_DESCRIPTION("loopback transport for vsock"); MODULE_ALIAS_NETPROTO(PF_VSOCK);
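/*
 * A hypothetical user-space sketch showing how this transport is exercised:
 * an AF_VSOCK stream socket connected to VMADDR_CID_LOCAL stays on the local
 * host and is delivered through vsock_loopback_send_pkt() above. The port
 * number 1234 is an arbitrary example and assumes a listener is already
 * bound there.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>

int main(void)
{
	struct sockaddr_vm addr;
	int fd = socket(AF_VSOCK, SOCK_STREAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.svm_family = AF_VSOCK;
	addr.svm_cid = VMADDR_CID_LOCAL;	/* CID handled by the loopback transport */
	addr.svm_port = 1234;

	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		perror("connect");

	close(fd);
	return 0;
}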
47 1 46 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2011 Florian Westphal <fw@strlen.de> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/route.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <linux/netfilter/xt_rpfilter.h> #include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_DESCRIPTION("Xtables: IPv6 reverse path filter match"); static bool rpfilter_addr_unicast(const struct in6_addr *addr) { int addr_type = ipv6_addr_type(addr); return addr_type & IPV6_ADDR_UNICAST; } static bool rpfilter_addr_linklocal(const struct in6_addr *addr) { int addr_type = ipv6_addr_type(addr); return addr_type & IPV6_ADDR_LINKLOCAL; } static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, const struct net_device *dev, u8 flags) { struct rt6_info *rt; struct ipv6hdr *iph = ipv6_hdr(skb); bool ret = false; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev), .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK, .flowi6_proto = iph->nexthdr, .flowi6_uid = sock_net_uid(net, NULL), .daddr = iph->saddr, }; int lookup_flags; if (rpfilter_addr_unicast(&iph->daddr)) { memcpy(&fl6.saddr, &iph->daddr, sizeof(struct in6_addr)); lookup_flags = RT6_LOOKUP_F_HAS_SADDR; } else { lookup_flags = 0; } fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? 
skb->mark : 0; if (rpfilter_addr_linklocal(&iph->saddr)) { lookup_flags |= RT6_LOOKUP_F_IFACE; fl6.flowi6_oif = dev->ifindex; } else if ((flags & XT_RPFILTER_LOOSE) == 0) fl6.flowi6_oif = dev->ifindex; rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags); if (rt->dst.error) goto out; if (rt->rt6i_flags & (RTF_REJECT|RTF_ANYCAST)) goto out; if (rt->rt6i_flags & RTF_LOCAL) { ret = flags & XT_RPFILTER_ACCEPT_LOCAL; goto out; } if (rt->rt6i_idev->dev == dev || l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == dev->ifindex || (flags & XT_RPFILTER_LOOSE)) ret = true; out: ip6_rt_put(rt); return ret; } static bool rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in) { return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK; } static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rpfilter_info *info = par->matchinfo; int saddrtype; struct ipv6hdr *iph; bool invert = info->flags & XT_RPFILTER_INVERT; if (rpfilter_is_loopback(skb, xt_in(par))) return true ^ invert; iph = ipv6_hdr(skb); saddrtype = ipv6_addr_type(&iph->saddr); if (unlikely(saddrtype == IPV6_ADDR_ANY)) return true ^ invert; /* not routable: forward path will drop it */ return rpfilter_lookup_reverse6(xt_net(par), skb, xt_in(par), info->flags) ^ invert; } static int rpfilter_check(const struct xt_mtchk_param *par) { const struct xt_rpfilter_info *info = par->matchinfo; unsigned int options = ~XT_RPFILTER_OPTION_MASK; if (info->flags & options) { pr_info_ratelimited("unknown options\n"); return -EINVAL; } if (strcmp(par->table, "mangle") != 0 && strcmp(par->table, "raw") != 0) { pr_info_ratelimited("only valid in \'raw\' or \'mangle\' table, not \'%s\'\n", par->table); return -EINVAL; } return 0; } static struct xt_match rpfilter_mt_reg __read_mostly = { .name = "rpfilter", .family = NFPROTO_IPV6, .checkentry = rpfilter_check, .match = rpfilter_mt, .matchsize = sizeof(struct xt_rpfilter_info), .hooks = (1 << NF_INET_PRE_ROUTING), .me = THIS_MODULE }; static int __init rpfilter_mt_init(void) { return xt_register_match(&rpfilter_mt_reg); } static void __exit rpfilter_mt_exit(void) { xt_unregister_match(&rpfilter_mt_reg); } module_init(rpfilter_mt_init); module_exit(rpfilter_mt_exit);
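/*
 * Usage note (from the iptables user space side, not this module): the match
 * is typically added in the raw or mangle table, for example
 *
 *	ip6tables -t raw -A PREROUTING -m rpfilter --invert -j DROP
 *
 * where --loose, --validmark, --accept-local and --invert correspond to the
 * XT_RPFILTER_LOOSE, XT_RPFILTER_VALID_MARK, XT_RPFILTER_ACCEPT_LOCAL and
 * XT_RPFILTER_INVERT flags checked above. The exact rule is an illustrative
 * example, not taken from this file.
 */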
15 41 2 241 230 4 23 27 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 // SPDX-License-Identifier: GPL-2.0-or-later /* Key garbage collector * * Copyright (C) 2009-2011 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/slab.h> #include <linux/security.h> #include <keys/keyring-type.h> #include "internal.h" /* * Delay between key revocation/expiry in seconds */ unsigned key_gc_delay = 5 * 60; /* * Reaper for unused keys. */ static void key_garbage_collector(struct work_struct *work); DECLARE_WORK(key_gc_work, key_garbage_collector); /* * Reaper for links from keyrings to dead keys. */ static void key_gc_timer_func(struct timer_list *); static DEFINE_TIMER(key_gc_timer, key_gc_timer_func); static time64_t key_gc_next_run = TIME64_MAX; static struct key_type *key_gc_dead_keytype; static unsigned long key_gc_flags; #define KEY_GC_KEY_EXPIRED 0 /* A key expired and needs unlinking */ #define KEY_GC_REAP_KEYTYPE 1 /* A keytype is being unregistered */ #define KEY_GC_REAPING_KEYTYPE 2 /* Cleared when keytype reaped */ /* * Any key whose type gets unregistered will be re-typed to this if it can't be * immediately unlinked. */ struct key_type key_type_dead = { .name = ".dead", }; /* * Schedule a garbage collection run. * - time precision isn't particularly important */ void key_schedule_gc(time64_t gc_at) { unsigned long expires; time64_t now = ktime_get_real_seconds(); kenter("%lld", gc_at - now); if (gc_at <= now || test_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags)) { kdebug("IMMEDIATE"); schedule_work(&key_gc_work); } else if (gc_at < key_gc_next_run) { kdebug("DEFERRED"); key_gc_next_run = gc_at; expires = jiffies + (gc_at - now) * HZ; mod_timer(&key_gc_timer, expires); } } /* * Set the expiration time on a key. */ void key_set_expiry(struct key *key, time64_t expiry) { key->expiry = expiry; if (expiry != TIME64_MAX) { if (!(key->type->flags & KEY_TYPE_INSTANT_REAP)) expiry += key_gc_delay; key_schedule_gc(expiry); } } /* * Schedule a dead links collection run. 
*/ void key_schedule_gc_links(void) { set_bit(KEY_GC_KEY_EXPIRED, &key_gc_flags); schedule_work(&key_gc_work); } /* * Some key's cleanup time was met after it expired, so we need to get the * reaper to go through a cycle finding expired keys. */ static void key_gc_timer_func(struct timer_list *unused) { kenter(""); key_gc_next_run = TIME64_MAX; key_schedule_gc_links(); } /* * Reap keys of dead type. * * We use three flags to make sure we see three complete cycles of the garbage * collector: the first to mark keys of that type as being dead, the second to * collect dead links and the third to clean up the dead keys. We have to be * careful as there may already be a cycle in progress. * * The caller must be holding key_types_sem. */ void key_gc_keytype(struct key_type *ktype) { kenter("%s", ktype->name); key_gc_dead_keytype = ktype; set_bit(KEY_GC_REAPING_KEYTYPE, &key_gc_flags); smp_mb(); set_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags); kdebug("schedule"); schedule_work(&key_gc_work); kdebug("sleep"); wait_on_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE, TASK_UNINTERRUPTIBLE); key_gc_dead_keytype = NULL; kleave(""); } /* * Garbage collect a list of unreferenced, detached keys */ static noinline void key_gc_unused_keys(struct list_head *keys) { while (!list_empty(keys)) { struct key *key = list_entry(keys->next, struct key, graveyard_link); short state = key->state; list_del(&key->graveyard_link); kdebug("- %u", key->serial); key_check(key); #ifdef CONFIG_KEY_NOTIFICATIONS remove_watch_list(key->watchers, key->serial); key->watchers = NULL; #endif /* Throw away the key data if the key is instantiated */ if (state == KEY_IS_POSITIVE && key->type->destroy) key->type->destroy(key); security_key_free(key); /* deal with the user's key tracking and quota */ if (test_bit(KEY_FLAG_IN_QUOTA, &key->flags)) { spin_lock(&key->user->lock); key->user->qnkeys--; key->user->qnbytes -= key->quotalen; spin_unlock(&key->user->lock); } atomic_dec(&key->user->nkeys); if (state != KEY_IS_UNINSTANTIATED) atomic_dec(&key->user->nikeys); key_user_put(key->user); key_put_tag(key->domain_tag); kfree(key->description); memzero_explicit(key, sizeof(*key)); kmem_cache_free(key_jar, key); } } /* * Garbage collector for unused keys. * * This is done in process context so that we don't have to disable interrupts * all over the place. key_put() schedules this rather than trying to do the * cleanup itself, which means key_put() doesn't have to sleep. 
*/ static void key_garbage_collector(struct work_struct *work) { static LIST_HEAD(graveyard); static u8 gc_state; /* Internal persistent state */ #define KEY_GC_REAP_AGAIN 0x01 /* - Need another cycle */ #define KEY_GC_REAPING_LINKS 0x02 /* - We need to reap links */ #define KEY_GC_REAPING_DEAD_1 0x10 /* - We need to mark dead keys */ #define KEY_GC_REAPING_DEAD_2 0x20 /* - We need to reap dead key links */ #define KEY_GC_REAPING_DEAD_3 0x40 /* - We need to reap dead keys */ #define KEY_GC_FOUND_DEAD_KEY 0x80 /* - We found at least one dead key */ struct rb_node *cursor; struct key *key; time64_t new_timer, limit, expiry; kenter("[%lx,%x]", key_gc_flags, gc_state); limit = ktime_get_real_seconds(); /* Work out what we're going to be doing in this pass */ gc_state &= KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2; gc_state <<= 1; if (test_and_clear_bit(KEY_GC_KEY_EXPIRED, &key_gc_flags)) gc_state |= KEY_GC_REAPING_LINKS; if (test_and_clear_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags)) gc_state |= KEY_GC_REAPING_DEAD_1; kdebug("new pass %x", gc_state); new_timer = TIME64_MAX; /* As only this function is permitted to remove things from the key * serial tree, if cursor is non-NULL then it will always point to a * valid node in the tree - even if lock got dropped. */ spin_lock(&key_serial_lock); cursor = rb_first(&key_serial_tree); continue_scanning: while (cursor) { key = rb_entry(cursor, struct key, serial_node); cursor = rb_next(cursor); if (refcount_read(&key->usage) == 0) goto found_unreferenced_key; if (unlikely(gc_state & KEY_GC_REAPING_DEAD_1)) { if (key->type == key_gc_dead_keytype) { gc_state |= KEY_GC_FOUND_DEAD_KEY; set_bit(KEY_FLAG_DEAD, &key->flags); key->perm = 0; goto skip_dead_key; } else if (key->type == &key_type_keyring && key->restrict_link) { goto found_restricted_keyring; } } expiry = key->expiry; if (expiry != TIME64_MAX) { if (!(key->type->flags & KEY_TYPE_INSTANT_REAP)) expiry += key_gc_delay; if (expiry > limit && expiry < new_timer) { kdebug("will expire %x in %lld", key_serial(key), key->expiry - limit); new_timer = key->expiry; } } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_2)) if (key->type == key_gc_dead_keytype) gc_state |= KEY_GC_FOUND_DEAD_KEY; if ((gc_state & KEY_GC_REAPING_LINKS) || unlikely(gc_state & KEY_GC_REAPING_DEAD_2)) { if (key->type == &key_type_keyring) goto found_keyring; } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_3)) if (key->type == key_gc_dead_keytype) goto destroy_dead_key; skip_dead_key: if (spin_is_contended(&key_serial_lock) || need_resched()) goto contended; } contended: spin_unlock(&key_serial_lock); maybe_resched: if (cursor) { cond_resched(); spin_lock(&key_serial_lock); goto continue_scanning; } /* We've completed the pass. Set the timer if we need to and queue a * new cycle if necessary. We keep executing cycles until we find one * where we didn't reap any keys. */ kdebug("pass complete"); if (new_timer != TIME64_MAX) { new_timer += key_gc_delay; key_schedule_gc(new_timer); } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_2) || !list_empty(&graveyard)) { /* Make sure that all pending keyring payload destructions are * fulfilled and that people aren't now looking at dead or * dying keys that they don't have a reference upon or a link * to. 
*/ kdebug("gc sync"); synchronize_rcu(); } if (!list_empty(&graveyard)) { kdebug("gc keys"); key_gc_unused_keys(&graveyard); } if (unlikely(gc_state & (KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2))) { if (!(gc_state & KEY_GC_FOUND_DEAD_KEY)) { /* No remaining dead keys: short circuit the remaining * keytype reap cycles. */ kdebug("dead short"); gc_state &= ~(KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2); gc_state |= KEY_GC_REAPING_DEAD_3; } else { gc_state |= KEY_GC_REAP_AGAIN; } } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_3)) { kdebug("dead wake"); smp_mb(); clear_bit(KEY_GC_REAPING_KEYTYPE, &key_gc_flags); wake_up_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE); } if (gc_state & KEY_GC_REAP_AGAIN) schedule_work(&key_gc_work); kleave(" [end %x]", gc_state); return; /* We found an unreferenced key - once we've removed it from the tree, * we can safely drop the lock. */ found_unreferenced_key: kdebug("unrefd key %d", key->serial); rb_erase(&key->serial_node, &key_serial_tree); spin_unlock(&key_serial_lock); list_add_tail(&key->graveyard_link, &graveyard); gc_state |= KEY_GC_REAP_AGAIN; goto maybe_resched; /* We found a restricted keyring and need to update the restriction if * it is associated with the dead key type. */ found_restricted_keyring: spin_unlock(&key_serial_lock); keyring_restriction_gc(key, key_gc_dead_keytype); goto maybe_resched; /* We found a keyring and we need to check the payload for links to * dead or expired keys. We don't flag another reap immediately as we * have to wait for the old payload to be destroyed by RCU before we * can reap the keys to which it refers. */ found_keyring: spin_unlock(&key_serial_lock); keyring_gc(key, limit); goto maybe_resched; /* We found a dead key that is still referenced. Reset its type and * destroy its payload with its semaphore held. */ destroy_dead_key: spin_unlock(&key_serial_lock); kdebug("destroy key %d", key->serial); down_write(&key->sem); key->type = &key_type_dead; if (key_gc_dead_keytype->destroy) key_gc_dead_keytype->destroy(key); memset(&key->payload, KEY_DESTROY, sizeof(key->payload)); up_write(&key->sem); goto maybe_resched; }
/* * net/tipc/group.c: TIPC group messaging code * * Copyright (c) 2017, Ericsson AB * Copyright (c) 2020, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include "core.h" #include "addr.h" #include "group.h" #include "bcast.h" #include "topsrv.h" #include "msg.h" #include "socket.h" #include "node.h" #include "name_table.h" #include "subscr.h" #define ADV_UNIT (((MAX_MSG_SIZE + MAX_H_SIZE) / FLOWCTL_BLK_SZ) + 1) #define ADV_IDLE ADV_UNIT #define ADV_ACTIVE (ADV_UNIT * 12) enum mbr_state { MBR_JOINING, MBR_PUBLISHED, MBR_JOINED, MBR_PENDING, MBR_ACTIVE, MBR_RECLAIMING, MBR_REMITTED, MBR_LEAVING }; struct tipc_member { struct rb_node tree_node; struct list_head list; struct list_head small_win; struct sk_buff_head deferredq; struct tipc_group *group; u32 node; u32 port; u32 instance; enum mbr_state state; u16 advertised; u16 window; u16 bc_rcv_nxt; u16 bc_syncpt; u16 bc_acked; }; struct tipc_group { struct rb_root members; struct list_head small_win; struct list_head pending; struct list_head active; struct tipc_nlist dests; struct net *net; int subid; u32 type; u32 instance; u32 scope; u32 portid; u16 member_cnt; u16 active_cnt; u16 max_active; u16 bc_snd_nxt; u16 bc_ackers; bool *open; bool loopback; bool events; }; static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq); static void tipc_group_open(struct tipc_member *m, bool *wakeup) { *wakeup = false; if (list_empty(&m->small_win)) return; list_del_init(&m->small_win); *m->group->open = true; *wakeup = true; } static void tipc_group_decr_active(struct tipc_group *grp, struct tipc_member *m) { if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING || m->state == MBR_REMITTED) grp->active_cnt--; } static int tipc_group_rcvbuf_limit(struct tipc_group *grp) { int max_active, active_pool, idle_pool; int mcnt = grp->member_cnt + 1; /* Limit simultaneous reception from other members */ max_active = min(mcnt / 8, 64); max_active = max(max_active, 16); grp->max_active = max_active; /* Reserve blocks for active and idle members */ active_pool = max_active * ADV_ACTIVE; idle_pool = (mcnt - max_active) * ADV_IDLE; /* Scale to bytes, considering worst-case truesize/msgsize ratio */ return (active_pool + idle_pool) * FLOWCTL_BLK_SZ * 4; } u16 tipc_group_bc_snd_nxt(struct tipc_group *grp) { return grp->bc_snd_nxt; } static bool tipc_group_is_receiver(struct tipc_member *m) { return m && m->state != MBR_JOINING && m->state != MBR_LEAVING; } static bool tipc_group_is_sender(struct tipc_member *m) { return m && m->state != MBR_JOINING && m->state != MBR_PUBLISHED; } u32 tipc_group_exclude(struct tipc_group *grp) { if (!grp->loopback) return grp->portid; return 0; } struct tipc_group *tipc_group_create(struct net *net, u32 portid, struct tipc_group_req *mreq, bool *group_is_open) { u32 filter = TIPC_SUB_PORTS | TIPC_SUB_NO_STATUS; bool global = mreq->scope != TIPC_NODE_SCOPE; struct tipc_group *grp; u32 type = mreq->type; grp = kzalloc(sizeof(*grp), GFP_ATOMIC); if (!grp) return NULL; tipc_nlist_init(&grp->dests, tipc_own_addr(net)); INIT_LIST_HEAD(&grp->small_win); INIT_LIST_HEAD(&grp->active); INIT_LIST_HEAD(&grp->pending); grp->members = RB_ROOT; grp->net = net; grp->portid = portid; grp->type = type; grp->instance = mreq->instance; grp->scope = mreq->scope; grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK; grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS; grp->open = group_is_open; *grp->open = false; filter |= global ? 
TIPC_SUB_CLUSTER_SCOPE : TIPC_SUB_NODE_SCOPE; if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, filter, &grp->subid)) return grp; kfree(grp); return NULL; } void tipc_group_join(struct net *net, struct tipc_group *grp, int *sk_rcvbuf) { struct rb_root *tree = &grp->members; struct tipc_member *m, *tmp; struct sk_buff_head xmitq; __skb_queue_head_init(&xmitq); rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, &xmitq); tipc_group_update_member(m, 0); } tipc_node_distr_xmit(net, &xmitq); *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); } void tipc_group_delete(struct net *net, struct tipc_group *grp) { struct rb_root *tree = &grp->members; struct tipc_member *m, *tmp; struct sk_buff_head xmitq; __skb_queue_head_init(&xmitq); rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { tipc_group_proto_xmit(grp, m, GRP_LEAVE_MSG, &xmitq); __skb_queue_purge(&m->deferredq); list_del(&m->list); kfree(m); } tipc_node_distr_xmit(net, &xmitq); tipc_nlist_purge(&grp->dests); tipc_topsrv_kern_unsubscr(net, grp->subid); kfree(grp); } static struct tipc_member *tipc_group_find_member(struct tipc_group *grp, u32 node, u32 port) { struct rb_node *n = grp->members.rb_node; u64 nkey, key = (u64)node << 32 | port; struct tipc_member *m; while (n) { m = container_of(n, struct tipc_member, tree_node); nkey = (u64)m->node << 32 | m->port; if (key < nkey) n = n->rb_left; else if (key > nkey) n = n->rb_right; else return m; } return NULL; } static struct tipc_member *tipc_group_find_dest(struct tipc_group *grp, u32 node, u32 port) { struct tipc_member *m; m = tipc_group_find_member(grp, node, port); if (m && tipc_group_is_receiver(m)) return m; return NULL; } static struct tipc_member *tipc_group_find_node(struct tipc_group *grp, u32 node) { struct tipc_member *m; struct rb_node *n; for (n = rb_first(&grp->members); n; n = rb_next(n)) { m = container_of(n, struct tipc_member, tree_node); if (m->node == node) return m; } return NULL; } static int tipc_group_add_to_tree(struct tipc_group *grp, struct tipc_member *m) { u64 nkey, key = (u64)m->node << 32 | m->port; struct rb_node **n, *parent = NULL; struct tipc_member *tmp; n = &grp->members.rb_node; while (*n) { tmp = container_of(*n, struct tipc_member, tree_node); parent = *n; tmp = container_of(parent, struct tipc_member, tree_node); nkey = (u64)tmp->node << 32 | tmp->port; if (key < nkey) n = &(*n)->rb_left; else if (key > nkey) n = &(*n)->rb_right; else return -EEXIST; } rb_link_node(&m->tree_node, parent, n); rb_insert_color(&m->tree_node, &grp->members); return 0; } static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, u32 node, u32 port, u32 instance, int state) { struct tipc_member *m; int ret; m = kzalloc(sizeof(*m), GFP_ATOMIC); if (!m) return NULL; INIT_LIST_HEAD(&m->list); INIT_LIST_HEAD(&m->small_win); __skb_queue_head_init(&m->deferredq); m->group = grp; m->node = node; m->port = port; m->instance = instance; m->bc_acked = grp->bc_snd_nxt - 1; ret = tipc_group_add_to_tree(grp, m); if (ret < 0) { kfree(m); return NULL; } grp->member_cnt++; tipc_nlist_add(&grp->dests, m->node); m->state = state; return m; } void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port, u32 instance) { tipc_group_create_member(grp, node, port, instance, MBR_PUBLISHED); } static void tipc_group_delete_member(struct tipc_group *grp, struct tipc_member *m) { rb_erase(&m->tree_node, &grp->members); grp->member_cnt--; /* Check if we were waiting for replicast ack from this member */ if 
(grp->bc_ackers && less(m->bc_acked, grp->bc_snd_nxt - 1)) grp->bc_ackers--; list_del_init(&m->list); list_del_init(&m->small_win); tipc_group_decr_active(grp, m); /* If last member on a node, remove node from dest list */ if (!tipc_group_find_node(grp, m->node)) tipc_nlist_del(&grp->dests, m->node); kfree(m); } struct tipc_nlist *tipc_group_dests(struct tipc_group *grp) { return &grp->dests; } void tipc_group_self(struct tipc_group *grp, struct tipc_service_range *seq, int *scope) { seq->type = grp->type; seq->lower = grp->instance; seq->upper = grp->instance; *scope = grp->scope; } void tipc_group_update_member(struct tipc_member *m, int len) { struct tipc_group *grp = m->group; struct tipc_member *_m, *tmp; if (!tipc_group_is_receiver(m)) return; m->window -= len; if (m->window >= ADV_IDLE) return; list_del_init(&m->small_win); /* Sort member into small_window members' list */ list_for_each_entry_safe(_m, tmp, &grp->small_win, small_win) { if (_m->window > m->window) break; } list_add_tail(&m->small_win, &_m->small_win); } void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack) { u16 prev = grp->bc_snd_nxt - 1; struct tipc_member *m; struct rb_node *n; u16 ackers = 0; for (n = rb_first(&grp->members); n; n = rb_next(n)) { m = container_of(n, struct tipc_member, tree_node); if (tipc_group_is_receiver(m)) { tipc_group_update_member(m, len); m->bc_acked = prev; ackers++; } } /* Mark number of acknowledges to expect, if any */ if (ack) grp->bc_ackers = ackers; grp->bc_snd_nxt++; } bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, int len, struct tipc_member **mbr) { struct sk_buff_head xmitq; struct tipc_member *m; int adv, state; m = tipc_group_find_dest(grp, dnode, dport); if (!tipc_group_is_receiver(m)) { *mbr = NULL; return false; } *mbr = m; if (m->window >= len) return false; *grp->open = false; /* If not fully advertised, do it now to prevent mutual blocking */ adv = m->advertised; state = m->state; if (state == MBR_JOINED && adv == ADV_IDLE) return true; if (state == MBR_ACTIVE && adv == ADV_ACTIVE) return true; if (state == MBR_PENDING && adv == ADV_IDLE) return true; __skb_queue_head_init(&xmitq); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, &xmitq); tipc_node_distr_xmit(grp->net, &xmitq); return true; } bool tipc_group_bc_cong(struct tipc_group *grp, int len) { struct tipc_member *m = NULL; /* If prev bcast was replicast, reject until all receivers have acked */ if (grp->bc_ackers) { *grp->open = false; return true; } if (list_empty(&grp->small_win)) return false; m = list_first_entry(&grp->small_win, struct tipc_member, small_win); if (m->window >= len) return false; return tipc_group_cong(grp, m->node, m->port, len, &m); } /* tipc_group_sort_msg() - sort msg into queue by bcast sequence number */ static void tipc_group_sort_msg(struct sk_buff *skb, struct sk_buff_head *defq) { struct tipc_msg *_hdr, *hdr = buf_msg(skb); u16 bc_seqno = msg_grp_bc_seqno(hdr); struct sk_buff *_skb, *tmp; int mtyp = msg_type(hdr); /* Bcast/mcast may be bypassed by ucast or other bcast, - sort it in */ if (mtyp == TIPC_GRP_BCAST_MSG || mtyp == TIPC_GRP_MCAST_MSG) { skb_queue_walk_safe(defq, _skb, tmp) { _hdr = buf_msg(_skb); if (!less(bc_seqno, msg_grp_bc_seqno(_hdr))) continue; __skb_queue_before(defq, _skb, skb); return; } /* Bcast was not bypassed, - add to tail */ } /* Unicasts are never bypassed, - always add to tail */ __skb_queue_tail(defq, skb); } /* tipc_group_filter_msg() - determine if we should accept arriving message */ void 
tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct sk_buff *skb = __skb_dequeue(inputq); bool ack, deliver, update, leave = false; struct sk_buff_head *defq; struct tipc_member *m; struct tipc_msg *hdr; u32 node, port; int mtyp, blks; if (!skb) return; hdr = buf_msg(skb); node = msg_orignode(hdr); port = msg_origport(hdr); if (!msg_in_group(hdr)) goto drop; m = tipc_group_find_member(grp, node, port); if (!tipc_group_is_sender(m)) goto drop; if (less(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) goto drop; TIPC_SKB_CB(skb)->orig_member = m->instance; defq = &m->deferredq; tipc_group_sort_msg(skb, defq); while ((skb = skb_peek(defq))) { hdr = buf_msg(skb); mtyp = msg_type(hdr); blks = msg_blocks(hdr); deliver = true; ack = false; update = false; if (more(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) break; /* Decide what to do with message */ switch (mtyp) { case TIPC_GRP_MCAST_MSG: if (msg_nameinst(hdr) != grp->instance) { update = true; deliver = false; } fallthrough; case TIPC_GRP_BCAST_MSG: m->bc_rcv_nxt++; ack = msg_grp_bc_ack_req(hdr); break; case TIPC_GRP_UCAST_MSG: break; case TIPC_GRP_MEMBER_EVT: if (m->state == MBR_LEAVING) leave = true; if (!grp->events) deliver = false; break; default: break; } /* Execute decisions */ __skb_dequeue(defq); if (deliver) __skb_queue_tail(inputq, skb); else kfree_skb(skb); if (ack) tipc_group_proto_xmit(grp, m, GRP_ACK_MSG, xmitq); if (leave) { __skb_queue_purge(defq); tipc_group_delete_member(grp, m); break; } if (!update) continue; tipc_group_update_rcv_win(grp, blks, node, port, xmitq); } return; drop: kfree_skb(skb); } void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, u32 port, struct sk_buff_head *xmitq) { struct list_head *active = &grp->active; int max_active = grp->max_active; int reclaim_limit = max_active * 3 / 4; int active_cnt = grp->active_cnt; struct tipc_member *m, *rm, *pm; m = tipc_group_find_member(grp, node, port); if (!m) return; m->advertised -= blks; switch (m->state) { case MBR_JOINED: /* First, decide if member can go active */ if (active_cnt <= max_active) { m->state = MBR_ACTIVE; list_add_tail(&m->list, active); grp->active_cnt++; tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); } else { m->state = MBR_PENDING; list_add_tail(&m->list, &grp->pending); } if (active_cnt < reclaim_limit) break; /* Reclaim from oldest active member, if possible */ if (!list_empty(active)) { rm = list_first_entry(active, struct tipc_member, list); rm->state = MBR_RECLAIMING; list_del_init(&rm->list); tipc_group_proto_xmit(grp, rm, GRP_RECLAIM_MSG, xmitq); break; } /* Nobody to reclaim from; - revert oldest pending to JOINED */ pm = list_first_entry(&grp->pending, struct tipc_member, list); list_del_init(&pm->list); pm->state = MBR_JOINED; tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); break; case MBR_ACTIVE: if (!list_is_last(&m->list, &grp->active)) list_move_tail(&m->list, &grp->active); if (m->advertised > (ADV_ACTIVE * 3 / 4)) break; tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); break; case MBR_REMITTED: if (m->advertised > ADV_IDLE) break; m->state = MBR_JOINED; grp->active_cnt--; if (m->advertised < ADV_IDLE) { pr_warn_ratelimited("Rcv unexpected msg after REMIT\n"); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); } if (list_empty(&grp->pending)) return; /* Set oldest pending member to active and advertise */ pm = list_first_entry(&grp->pending, struct tipc_member, list); pm->state = MBR_ACTIVE; list_move_tail(&pm->list, &grp->active); grp->active_cnt++; 
tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); break; case MBR_RECLAIMING: case MBR_JOINING: case MBR_LEAVING: default: break; } } static void tipc_group_create_event(struct tipc_group *grp, struct tipc_member *m, u32 event, u16 seqno, struct sk_buff_head *inputq) { u32 dnode = tipc_own_addr(grp->net); struct tipc_event evt; struct sk_buff *skb; struct tipc_msg *hdr; memset(&evt, 0, sizeof(evt)); evt.event = event; evt.found_lower = m->instance; evt.found_upper = m->instance; evt.port.ref = m->port; evt.port.node = m->node; evt.s.seq.type = grp->type; evt.s.seq.lower = m->instance; evt.s.seq.upper = m->instance; skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_GRP_MEMBER_EVT, GROUP_H_SIZE, sizeof(evt), dnode, m->node, grp->portid, m->port, 0); if (!skb) return; hdr = buf_msg(skb); msg_set_nametype(hdr, grp->type); msg_set_grp_evt(hdr, event); msg_set_dest_droppable(hdr, true); msg_set_grp_bc_seqno(hdr, seqno); memcpy(msg_data(hdr), &evt, sizeof(evt)); TIPC_SKB_CB(skb)->orig_member = m->instance; __skb_queue_tail(inputq, skb); } static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, int mtyp, struct sk_buff_head *xmitq) { struct tipc_msg *hdr; struct sk_buff *skb; int adv = 0; skb = tipc_msg_create(GROUP_PROTOCOL, mtyp, INT_H_SIZE, 0, m->node, tipc_own_addr(grp->net), m->port, grp->portid, 0); if (!skb) return; if (m->state == MBR_ACTIVE) adv = ADV_ACTIVE - m->advertised; else if (m->state == MBR_JOINED || m->state == MBR_PENDING) adv = ADV_IDLE - m->advertised; hdr = buf_msg(skb); if (mtyp == GRP_JOIN_MSG) { msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); msg_set_adv_win(hdr, adv); m->advertised += adv; } else if (mtyp == GRP_LEAVE_MSG) { msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); } else if (mtyp == GRP_ADV_MSG) { msg_set_adv_win(hdr, adv); m->advertised += adv; } else if (mtyp == GRP_ACK_MSG) { msg_set_grp_bc_acked(hdr, m->bc_rcv_nxt); } else if (mtyp == GRP_REMIT_MSG) { msg_set_grp_remitted(hdr, m->window); } msg_set_dest_droppable(hdr, true); __skb_queue_tail(xmitq, skb); } void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { u32 node = msg_orignode(hdr); u32 port = msg_origport(hdr); struct tipc_member *m, *pm; u16 remitted, in_flight; if (!grp) return; if (grp->scope == TIPC_NODE_SCOPE && node != tipc_own_addr(grp->net)) return; m = tipc_group_find_member(grp, node, port); switch (msg_type(hdr)) { case GRP_JOIN_MSG: if (!m) m = tipc_group_create_member(grp, node, port, 0, MBR_JOINING); if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); m->bc_rcv_nxt = m->bc_syncpt; m->window += msg_adv_win(hdr); /* Wait until PUBLISH event is received if necessary */ if (m->state != MBR_PUBLISHED) return; /* Member can be taken into service */ m->state = MBR_JOINED; tipc_group_open(m, usr_wakeup); tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); tipc_group_create_event(grp, m, TIPC_PUBLISHED, m->bc_syncpt, inputq); return; case GRP_LEAVE_MSG: if (!m) return; m->bc_syncpt = msg_grp_bc_syncpt(hdr); list_del_init(&m->list); tipc_group_open(m, usr_wakeup); tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; tipc_group_create_event(grp, m, TIPC_WITHDRAWN, m->bc_syncpt, inputq); return; case GRP_ADV_MSG: if (!m) return; m->window += msg_adv_win(hdr); tipc_group_open(m, usr_wakeup); return; case GRP_ACK_MSG: if (!m) return; m->bc_acked = msg_grp_bc_acked(hdr); if (--grp->bc_ackers) return; list_del_init(&m->small_win); *m->group->open = 
true; *usr_wakeup = true; tipc_group_update_member(m, 0); return; case GRP_RECLAIM_MSG: if (!m) return; tipc_group_proto_xmit(grp, m, GRP_REMIT_MSG, xmitq); m->window = ADV_IDLE; tipc_group_open(m, usr_wakeup); return; case GRP_REMIT_MSG: if (!m || m->state != MBR_RECLAIMING) return; remitted = msg_grp_remitted(hdr); /* Messages preceding the REMIT still in receive queue */ if (m->advertised > remitted) { m->state = MBR_REMITTED; in_flight = m->advertised - remitted; m->advertised = ADV_IDLE + in_flight; return; } /* This should never happen */ if (m->advertised < remitted) pr_warn_ratelimited("Unexpected REMIT msg\n"); /* All messages preceding the REMIT have been read */ m->state = MBR_JOINED; grp->active_cnt--; m->advertised = ADV_IDLE; /* Set oldest pending member to active and advertise */ if (list_empty(&grp->pending)) return; pm = list_first_entry(&grp->pending, struct tipc_member, list); pm->state = MBR_ACTIVE; list_move_tail(&pm->list, &grp->active); grp->active_cnt++; if (pm->advertised <= (ADV_ACTIVE * 3 / 4)) tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); return; default: pr_warn("Received unknown GROUP_PROTO message\n"); } } /* tipc_group_member_evt() - receive and handle a member up/down event */ void tipc_group_member_evt(struct tipc_group *grp, bool *usr_wakeup, int *sk_rcvbuf, struct tipc_msg *hdr, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct tipc_event *evt = (void *)msg_data(hdr); u32 instance = evt->found_lower; u32 node = evt->port.node; u32 port = evt->port.ref; int event = evt->event; struct tipc_member *m; struct net *net; u32 self; if (!grp) return; net = grp->net; self = tipc_own_addr(net); if (!grp->loopback && node == self && port == grp->portid) return; m = tipc_group_find_member(grp, node, port); switch (event) { case TIPC_PUBLISHED: /* Send and wait for arrival of JOIN message if necessary */ if (!m) { m = tipc_group_create_member(grp, node, port, instance, MBR_PUBLISHED); if (!m) break; tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); break; } if (m->state != MBR_JOINING) break; /* Member can be taken into service */ m->instance = instance; m->state = MBR_JOINED; tipc_group_open(m, usr_wakeup); tipc_group_update_member(m, 0); tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); tipc_group_create_event(grp, m, TIPC_PUBLISHED, m->bc_syncpt, inputq); break; case TIPC_WITHDRAWN: if (!m) break; tipc_group_decr_active(grp, m); m->state = MBR_LEAVING; list_del_init(&m->list); tipc_group_open(m, usr_wakeup); /* Only send event if no LEAVE message can be expected */ if (!tipc_node_is_up(net, node)) tipc_group_create_event(grp, m, TIPC_WITHDRAWN, m->bc_rcv_nxt, inputq); break; default: break; } *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); } int tipc_group_fill_sock_diag(struct tipc_group *grp, struct sk_buff *skb) { struct nlattr *group = nla_nest_start_noflag(skb, TIPC_NLA_SOCK_GROUP); if (!group) return -EMSGSIZE; if (nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_ID, grp->type) || nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_INSTANCE, grp->instance) || nla_put_u32(skb, TIPC_NLA_SOCK_GROUP_BC_SEND_NEXT, grp->bc_snd_nxt)) goto group_msg_cancel; if (grp->scope == TIPC_NODE_SCOPE) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_NODE_SCOPE)) goto group_msg_cancel; if (grp->scope == TIPC_CLUSTER_SCOPE) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_CLUSTER_SCOPE)) goto group_msg_cancel; if (*grp->open) if (nla_put_flag(skb, TIPC_NLA_SOCK_GROUP_OPEN)) goto group_msg_cancel; nla_nest_end(skb, group); return 0; group_msg_cancel: 
	nla_nest_cancel(skb, group);
	return -1;
}
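/*
 * Illustrative sketch only (not part of TIPC): the receive path above
 * buffers group broadcasts in a per-member deferred queue and releases
 * them strictly in bc_rcv_nxt order, comparing 16-bit sequence numbers
 * with the wraparound-safe less()/more() helpers.  The standalone
 * userspace program below (hypothetical names) demonstrates that
 * ordering idea on a toy buffer.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Wrap-safe "a precedes b" for 16-bit sequence numbers. */
static bool seq_less(uint16_t a, uint16_t b)
{
	return (int16_t)(a - b) < 0;
}

#define DEFQ_MAX 8

int main(void)
{
	/* Broadcasts arriving out of order across the 16-bit wrap point. */
	uint16_t arrivals[] = { 65534, 1, 65535, 0 };
	uint16_t defq[DEFQ_MAX];	/* deferred queue, kept sorted */
	int defq_len = 0;
	uint16_t rcv_nxt = 65534;	/* next expected sequence number */

	for (size_t i = 0; i < sizeof(arrivals) / sizeof(arrivals[0]); i++) {
		uint16_t seq = arrivals[i];
		int pos = defq_len;

		/* Insert into the deferred queue in sequence order. */
		while (pos > 0 && seq_less(seq, defq[pos - 1])) {
			defq[pos] = defq[pos - 1];
			pos--;
		}
		defq[pos] = seq;
		defq_len++;

		/* Deliver while the queue head is the expected message. */
		while (defq_len > 0 && defq[0] == rcv_nxt) {
			printf("deliver seq %u\n", (unsigned)defq[0]);
			rcv_nxt++;
			for (int j = 1; j < defq_len; j++)
				defq[j - 1] = defq[j];
			defq_len--;
		}
	}
	return 0;
}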
// SPDX-License-Identifier: GPL-2.0+
/*
 * NILFS module and super block management.
 *
 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
 *
 * Written by Ryusuke Konishi.
 */
/*
 *  linux/fs/ext2/super.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *  David S.
Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/module.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/parser.h> #include <linux/crc32.h> #include <linux/vfs.h> #include <linux/writeback.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/fs_context.h> #include "nilfs.h" #include "export.h" #include "mdt.h" #include "alloc.h" #include "btree.h" #include "btnode.h" #include "page.h" #include "cpfile.h" #include "sufile.h" /* nilfs_sufile_resize(), nilfs_sufile_set_alloc_range() */ #include "ifile.h" #include "dat.h" #include "segment.h" #include "segbuf.h" MODULE_AUTHOR("NTT Corp."); MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " "(NILFS)"); MODULE_LICENSE("GPL"); static struct kmem_cache *nilfs_inode_cachep; struct kmem_cache *nilfs_transaction_cachep; struct kmem_cache *nilfs_segbuf_cachep; struct kmem_cache *nilfs_btree_path_cache; static int nilfs_setup_super(struct super_block *sb, int is_mount); static int nilfs_remount(struct super_block *sb, int *flags, char *data); void __nilfs_msg(struct super_block *sb, const char *fmt, ...) { struct va_format vaf; va_list args; int level; va_start(args, fmt); level = printk_get_level(fmt); vaf.fmt = printk_skip_level(fmt); vaf.va = &args; if (sb) printk("%c%cNILFS (%s): %pV\n", KERN_SOH_ASCII, level, sb->s_id, &vaf); else printk("%c%cNILFS: %pV\n", KERN_SOH_ASCII, level, &vaf); va_end(args); } static void nilfs_set_error(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp; down_write(&nilfs->ns_sem); if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { nilfs->ns_mount_state |= NILFS_ERROR_FS; sbp = nilfs_prepare_super(sb, 0); if (likely(sbp)) { sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS); if (sbp[1]) sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS); nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL); } } up_write(&nilfs->ns_sem); } /** * __nilfs_error() - report failure condition on a filesystem * * __nilfs_error() sets an ERROR_FS flag on the superblock as well as * reporting an error message. This function should be called when * NILFS detects incoherences or defects of meta data on disk. * * This implements the body of nilfs_error() macro. Normally, * nilfs_error() should be used. As for sustainable errors such as a * single-shot I/O error, nilfs_err() should be used instead. * * Callers should not add a trailing newline since this will do it. */ void __nilfs_error(struct super_block *sb, const char *function, const char *fmt, ...) 
{ struct the_nilfs *nilfs = sb->s_fs_info; struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "NILFS error (device %s): %s: %pV\n", sb->s_id, function, &vaf); va_end(args); if (!sb_rdonly(sb)) { nilfs_set_error(sb); if (nilfs_test_opt(nilfs, ERRORS_RO)) { printk(KERN_CRIT "Remounting filesystem read-only\n"); sb->s_flags |= SB_RDONLY; } } if (nilfs_test_opt(nilfs, ERRORS_PANIC)) panic("NILFS (device %s): panic forced after error\n", sb->s_id); } struct inode *nilfs_alloc_inode(struct super_block *sb) { struct nilfs_inode_info *ii; ii = alloc_inode_sb(sb, nilfs_inode_cachep, GFP_NOFS); if (!ii) return NULL; ii->i_bh = NULL; ii->i_state = 0; ii->i_cno = 0; ii->i_assoc_inode = NULL; ii->i_bmap = &ii->i_bmap_data; return &ii->vfs_inode; } static void nilfs_free_inode(struct inode *inode) { if (nilfs_is_metadata_file_inode(inode)) nilfs_mdt_destroy(inode); kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); } static int nilfs_sync_super(struct super_block *sb, int flag) { struct the_nilfs *nilfs = sb->s_fs_info; int err; retry: set_buffer_dirty(nilfs->ns_sbh[0]); if (nilfs_test_opt(nilfs, BARRIER)) { err = __sync_dirty_buffer(nilfs->ns_sbh[0], REQ_SYNC | REQ_PREFLUSH | REQ_FUA); } else { err = sync_dirty_buffer(nilfs->ns_sbh[0]); } if (unlikely(err)) { nilfs_err(sb, "unable to write superblock: err=%d", err); if (err == -EIO && nilfs->ns_sbh[1]) { /* * sbp[0] points to newer log than sbp[1], * so copy sbp[0] to sbp[1] to take over sbp[0]. */ memcpy(nilfs->ns_sbp[1], nilfs->ns_sbp[0], nilfs->ns_sbsize); nilfs_fall_back_super_block(nilfs); goto retry; } } else { struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; nilfs->ns_sbwcount++; /* * The latest segment becomes trailable from the position * written in superblock. */ clear_nilfs_discontinued(nilfs); /* update GC protection for recent segments */ if (nilfs->ns_sbh[1]) { if (flag == NILFS_SB_COMMIT_ALL) { set_buffer_dirty(nilfs->ns_sbh[1]); if (sync_dirty_buffer(nilfs->ns_sbh[1]) < 0) goto out; } if (le64_to_cpu(nilfs->ns_sbp[1]->s_last_cno) < le64_to_cpu(nilfs->ns_sbp[0]->s_last_cno)) sbp = nilfs->ns_sbp[1]; } spin_lock(&nilfs->ns_last_segment_lock); nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq); spin_unlock(&nilfs->ns_last_segment_lock); } out: return err; } void nilfs_set_log_cursor(struct nilfs_super_block *sbp, struct the_nilfs *nilfs) { sector_t nfreeblocks; /* nilfs->ns_sem must be locked by the caller. */ nilfs_count_free_blocks(nilfs, &nfreeblocks); sbp->s_free_blocks_count = cpu_to_le64(nfreeblocks); spin_lock(&nilfs->ns_last_segment_lock); sbp->s_last_seq = cpu_to_le64(nilfs->ns_last_seq); sbp->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg); sbp->s_last_cno = cpu_to_le64(nilfs->ns_last_cno); spin_unlock(&nilfs->ns_last_segment_lock); } struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb, int flip) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp = nilfs->ns_sbp; /* nilfs->ns_sem must be locked by the caller. 
*/ if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) { memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); } else { nilfs_crit(sb, "superblock broke"); return NULL; } } else if (sbp[1] && sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); } if (flip && sbp[1]) nilfs_swap_super_block(nilfs); return sbp; } int nilfs_commit_super(struct super_block *sb, int flag) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp = nilfs->ns_sbp; time64_t t; /* nilfs->ns_sem must be locked by the caller. */ t = ktime_get_real_seconds(); nilfs->ns_sbwtime = t; sbp[0]->s_wtime = cpu_to_le64(t); sbp[0]->s_sum = 0; sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, (unsigned char *)sbp[0], nilfs->ns_sbsize)); if (flag == NILFS_SB_COMMIT_ALL && sbp[1]) { sbp[1]->s_wtime = sbp[0]->s_wtime; sbp[1]->s_sum = 0; sbp[1]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, (unsigned char *)sbp[1], nilfs->ns_sbsize)); } clear_nilfs_sb_dirty(nilfs); nilfs->ns_flushed_device = 1; /* make sure store to ns_flushed_device cannot be reordered */ smp_wmb(); return nilfs_sync_super(sb, flag); } /** * nilfs_cleanup_super() - write filesystem state for cleanup * @sb: super block instance to be unmounted or degraded to read-only * * This function restores state flags in the on-disk super block. * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the * filesystem was not clean previously. */ int nilfs_cleanup_super(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp; int flag = NILFS_SB_COMMIT; int ret = -EIO; sbp = nilfs_prepare_super(sb, 0); if (sbp) { sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); nilfs_set_log_cursor(sbp[0], nilfs); if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) { /* * make the "clean" flag also to the opposite * super block if both super blocks point to * the same checkpoint. */ sbp[1]->s_state = sbp[0]->s_state; flag = NILFS_SB_COMMIT_ALL; } ret = nilfs_commit_super(sb, flag); } return ret; } /** * nilfs_move_2nd_super - relocate secondary super block * @sb: super block instance * @sb2off: new offset of the secondary super block (in bytes) */ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) { struct the_nilfs *nilfs = sb->s_fs_info; struct buffer_head *nsbh; struct nilfs_super_block *nsbp; sector_t blocknr, newblocknr; unsigned long offset; int sb2i; /* array index of the secondary superblock */ int ret = 0; /* nilfs->ns_sem must be locked by the caller. 
*/ if (nilfs->ns_sbh[1] && nilfs->ns_sbh[1]->b_blocknr > nilfs->ns_first_data_block) { sb2i = 1; blocknr = nilfs->ns_sbh[1]->b_blocknr; } else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) { sb2i = 0; blocknr = nilfs->ns_sbh[0]->b_blocknr; } else { sb2i = -1; blocknr = 0; } if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off) goto out; /* super block location is unchanged */ /* Get new super block buffer */ newblocknr = sb2off >> nilfs->ns_blocksize_bits; offset = sb2off & (nilfs->ns_blocksize - 1); nsbh = sb_getblk(sb, newblocknr); if (!nsbh) { nilfs_warn(sb, "unable to move secondary superblock to block %llu", (unsigned long long)newblocknr); ret = -EIO; goto out; } nsbp = (void *)nsbh->b_data + offset; lock_buffer(nsbh); if (sb2i >= 0) { /* * The position of the second superblock only changes by 4KiB, * which is larger than the maximum superblock data size * (= 1KiB), so there is no need to use memmove() to allow * overlap between source and destination. */ memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize); /* * Zero fill after copy to avoid overwriting in case of move * within the same block. */ memset(nsbh->b_data, 0, offset); memset((void *)nsbp + nilfs->ns_sbsize, 0, nsbh->b_size - offset - nilfs->ns_sbsize); } else { memset(nsbh->b_data, 0, nsbh->b_size); } set_buffer_uptodate(nsbh); unlock_buffer(nsbh); if (sb2i >= 0) { brelse(nilfs->ns_sbh[sb2i]); nilfs->ns_sbh[sb2i] = nsbh; nilfs->ns_sbp[sb2i] = nsbp; } else if (nilfs->ns_sbh[0]->b_blocknr < nilfs->ns_first_data_block) { /* secondary super block will be restored to index 1 */ nilfs->ns_sbh[1] = nsbh; nilfs->ns_sbp[1] = nsbp; } else { brelse(nsbh); } out: return ret; } /** * nilfs_resize_fs - resize the filesystem * @sb: super block instance * @newsize: new size of the filesystem (in bytes) */ int nilfs_resize_fs(struct super_block *sb, __u64 newsize) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp; __u64 devsize, newnsegs; loff_t sb2off; int ret; ret = -ERANGE; devsize = bdev_nr_bytes(sb->s_bdev); if (newsize > devsize) goto out; /* * Prevent underflow in second superblock position calculation. * The exact minimum size check is done in nilfs_sufile_resize(). */ if (newsize < 4096) { ret = -ENOSPC; goto out; } /* * Write lock is required to protect some functions depending * on the number of segments, the number of reserved segments, * and so forth. */ down_write(&nilfs->ns_segctor_sem); sb2off = NILFS_SB2_OFFSET_BYTES(newsize); newnsegs = sb2off >> nilfs->ns_blocksize_bits; do_div(newnsegs, nilfs->ns_blocks_per_segment); ret = nilfs_sufile_resize(nilfs->ns_sufile, newnsegs); up_write(&nilfs->ns_segctor_sem); if (ret < 0) goto out; ret = nilfs_construct_segment(sb); if (ret < 0) goto out; down_write(&nilfs->ns_sem); nilfs_move_2nd_super(sb, sb2off); ret = -EIO; sbp = nilfs_prepare_super(sb, 0); if (likely(sbp)) { nilfs_set_log_cursor(sbp[0], nilfs); /* * Drop NILFS_RESIZE_FS flag for compatibility with * mount-time resize which may be implemented in a * future release. */ sbp[0]->s_state = cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_RESIZE_FS); sbp[0]->s_dev_size = cpu_to_le64(newsize); sbp[0]->s_nsegments = cpu_to_le64(nilfs->ns_nsegments); if (sbp[1]) memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); ret = nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL); } up_write(&nilfs->ns_sem); /* * Reset the range of allocatable segments last. 
This order * is important in the case of expansion because the secondary * superblock must be protected from log write until migration * completes. */ if (!ret) nilfs_sufile_set_alloc_range(nilfs->ns_sufile, 0, newnsegs - 1); out: return ret; } static void nilfs_put_super(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; nilfs_detach_log_writer(sb); if (!sb_rdonly(sb)) { down_write(&nilfs->ns_sem); nilfs_cleanup_super(sb); up_write(&nilfs->ns_sem); } nilfs_sysfs_delete_device_group(nilfs); iput(nilfs->ns_sufile); iput(nilfs->ns_cpfile); iput(nilfs->ns_dat); destroy_nilfs(nilfs); sb->s_fs_info = NULL; } static int nilfs_sync_fs(struct super_block *sb, int wait) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp; int err = 0; /* This function is called when super block should be written back */ if (wait) err = nilfs_construct_segment(sb); down_write(&nilfs->ns_sem); if (nilfs_sb_dirty(nilfs)) { sbp = nilfs_prepare_super(sb, nilfs_sb_will_flip(nilfs)); if (likely(sbp)) { nilfs_set_log_cursor(sbp[0], nilfs); nilfs_commit_super(sb, NILFS_SB_COMMIT); } } up_write(&nilfs->ns_sem); if (!err) err = nilfs_flush_device(nilfs); return err; } int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, struct nilfs_root **rootp) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_root *root; struct nilfs_checkpoint *raw_cp; struct buffer_head *bh_cp; int err = -ENOMEM; root = nilfs_find_or_create_root( nilfs, curr_mnt ? NILFS_CPTREE_CURRENT_CNO : cno); if (!root) return err; if (root->ifile) goto reuse; /* already attached checkpoint */ down_read(&nilfs->ns_segctor_sem); err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, &bh_cp); up_read(&nilfs->ns_segctor_sem); if (unlikely(err)) { if (err == -ENOENT || err == -EINVAL) { nilfs_err(sb, "Invalid checkpoint (checkpoint number=%llu)", (unsigned long long)cno); err = -EINVAL; } goto failed; } err = nilfs_ifile_read(sb, root, nilfs->ns_inode_size, &raw_cp->cp_ifile_inode, &root->ifile); if (err) goto failed_bh; atomic64_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); atomic64_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); reuse: *rootp = root; return 0; failed_bh: nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); failed: nilfs_put_root(root); return err; } static int nilfs_freeze(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; int err; if (sb_rdonly(sb)) return 0; /* Mark super block clean */ down_write(&nilfs->ns_sem); err = nilfs_cleanup_super(sb); up_write(&nilfs->ns_sem); return err; } static int nilfs_unfreeze(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; if (sb_rdonly(sb)) return 0; down_write(&nilfs->ns_sem); nilfs_setup_super(sb, false); up_write(&nilfs->ns_sem); return 0; } static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct nilfs_root *root = NILFS_I(d_inode(dentry))->i_root; struct the_nilfs *nilfs = root->nilfs; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); unsigned long long blocks; unsigned long overhead; unsigned long nrsvblocks; sector_t nfreeblocks; u64 nmaxinodes, nfreeinodes; int err; /* * Compute all of the segment blocks * * The blocks before first segment and after last segment * are excluded. 
*/ blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments - nilfs->ns_first_data_block; nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment; /* * Compute the overhead * * When distributing meta data blocks outside segment structure, * We must count them as the overhead. */ overhead = 0; err = nilfs_count_free_blocks(nilfs, &nfreeblocks); if (unlikely(err)) return err; err = nilfs_ifile_count_free_inodes(root->ifile, &nmaxinodes, &nfreeinodes); if (unlikely(err)) { nilfs_warn(sb, "failed to count free inodes: err=%d", err); if (err == -ERANGE) { /* * If nilfs_palloc_count_max_entries() returns * -ERANGE error code then we simply treat * curent inodes count as maximum possible and * zero as free inodes value. */ nmaxinodes = atomic64_read(&root->inodes_count); nfreeinodes = 0; err = 0; } else return err; } buf->f_type = NILFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = blocks - overhead; buf->f_bfree = nfreeblocks; buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? (buf->f_bfree - nrsvblocks) : 0; buf->f_files = nmaxinodes; buf->f_ffree = nfreeinodes; buf->f_namelen = NILFS_NAME_LEN; buf->f_fsid = u64_to_fsid(id); return 0; } static int nilfs_show_options(struct seq_file *seq, struct dentry *dentry) { struct super_block *sb = dentry->d_sb; struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_root *root = NILFS_I(d_inode(dentry))->i_root; if (!nilfs_test_opt(nilfs, BARRIER)) seq_puts(seq, ",nobarrier"); if (root->cno != NILFS_CPTREE_CURRENT_CNO) seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno); if (nilfs_test_opt(nilfs, ERRORS_PANIC)) seq_puts(seq, ",errors=panic"); if (nilfs_test_opt(nilfs, ERRORS_CONT)) seq_puts(seq, ",errors=continue"); if (nilfs_test_opt(nilfs, STRICT_ORDER)) seq_puts(seq, ",order=strict"); if (nilfs_test_opt(nilfs, NORECOVERY)) seq_puts(seq, ",norecovery"); if (nilfs_test_opt(nilfs, DISCARD)) seq_puts(seq, ",discard"); return 0; } static const struct super_operations nilfs_sops = { .alloc_inode = nilfs_alloc_inode, .free_inode = nilfs_free_inode, .dirty_inode = nilfs_dirty_inode, .evict_inode = nilfs_evict_inode, .put_super = nilfs_put_super, .sync_fs = nilfs_sync_fs, .freeze_fs = nilfs_freeze, .unfreeze_fs = nilfs_unfreeze, .statfs = nilfs_statfs, .remount_fs = nilfs_remount, .show_options = nilfs_show_options }; enum { Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, Opt_discard, Opt_nodiscard, Opt_err, }; static match_table_t tokens = { {Opt_err_cont, "errors=continue"}, {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_snapshot, "cp=%u"}, {Opt_order, "order=%s"}, {Opt_norecovery, "norecovery"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_err, NULL} }; static int parse_options(char *options, struct super_block *sb, int is_remount) { struct the_nilfs *nilfs = sb->s_fs_info; char *p; substring_t args[MAX_OPT_ARGS]; if (!options) return 1; while ((p = strsep(&options, ",")) != NULL) { int token; if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case Opt_barrier: nilfs_set_opt(nilfs, BARRIER); break; case Opt_nobarrier: nilfs_clear_opt(nilfs, BARRIER); break; case Opt_order: if (strcmp(args[0].from, "relaxed") == 0) /* Ordered data semantics */ nilfs_clear_opt(nilfs, STRICT_ORDER); else if (strcmp(args[0].from, "strict") == 0) /* Strict in-order semantics */ nilfs_set_opt(nilfs, STRICT_ORDER); else return 0; break; case Opt_err_panic: 
nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_PANIC); break; case Opt_err_ro: nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_RO); break; case Opt_err_cont: nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_CONT); break; case Opt_snapshot: if (is_remount) { nilfs_err(sb, "\"%s\" option is invalid for remount", p); return 0; } break; case Opt_norecovery: nilfs_set_opt(nilfs, NORECOVERY); break; case Opt_discard: nilfs_set_opt(nilfs, DISCARD); break; case Opt_nodiscard: nilfs_clear_opt(nilfs, DISCARD); break; default: nilfs_err(sb, "unrecognized mount option \"%s\"", p); return 0; } } return 1; } static inline void nilfs_set_default_options(struct super_block *sb, struct nilfs_super_block *sbp) { struct the_nilfs *nilfs = sb->s_fs_info; nilfs->ns_mount_opt = NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; } static int nilfs_setup_super(struct super_block *sb, int is_mount) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp; int max_mnt_count; int mnt_count; /* nilfs->ns_sem must be locked by the caller. */ sbp = nilfs_prepare_super(sb, 0); if (!sbp) return -EIO; if (!is_mount) goto skip_mount_setup; max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count); mnt_count = le16_to_cpu(sbp[0]->s_mnt_count); if (nilfs->ns_mount_state & NILFS_ERROR_FS) { nilfs_warn(sb, "mounting fs with errors"); #if 0 } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) { nilfs_warn(sb, "maximal mount count reached"); #endif } if (!max_mnt_count) sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1); sbp[0]->s_mtime = cpu_to_le64(ktime_get_real_seconds()); skip_mount_setup: sbp[0]->s_state = cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); /* synchronize sbp[1] with sbp[0] */ if (sbp[1]) memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); return nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL); } struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, u64 pos, int blocksize, struct buffer_head **pbh) { unsigned long long sb_index = pos; unsigned long offset; offset = do_div(sb_index, blocksize); *pbh = sb_bread(sb, sb_index); if (!*pbh) return NULL; return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset); } int nilfs_store_magic_and_option(struct super_block *sb, struct nilfs_super_block *sbp, char *data) { struct the_nilfs *nilfs = sb->s_fs_info; sb->s_magic = le16_to_cpu(sbp->s_magic); /* FS independent flags */ #ifdef NILFS_ATIME_DISABLE sb->s_flags |= SB_NOATIME; #endif nilfs_set_default_options(sb, sbp); nilfs->ns_resuid = le16_to_cpu(sbp->s_def_resuid); nilfs->ns_resgid = le16_to_cpu(sbp->s_def_resgid); nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval); nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max); return !parse_options(data, sb, 0) ? 
-EINVAL : 0; } int nilfs_check_feature_compatibility(struct super_block *sb, struct nilfs_super_block *sbp) { __u64 features; features = le64_to_cpu(sbp->s_feature_incompat) & ~NILFS_FEATURE_INCOMPAT_SUPP; if (features) { nilfs_err(sb, "couldn't mount because of unsupported optional features (%llx)", (unsigned long long)features); return -EINVAL; } features = le64_to_cpu(sbp->s_feature_compat_ro) & ~NILFS_FEATURE_COMPAT_RO_SUPP; if (!sb_rdonly(sb) && features) { nilfs_err(sb, "couldn't mount RDWR because of unsupported optional features (%llx)", (unsigned long long)features); return -EINVAL; } return 0; } static int nilfs_get_root_dentry(struct super_block *sb, struct nilfs_root *root, struct dentry **root_dentry) { struct inode *inode; struct dentry *dentry; int ret = 0; inode = nilfs_iget(sb, root, NILFS_ROOT_INO); if (IS_ERR(inode)) { ret = PTR_ERR(inode); nilfs_err(sb, "error %d getting root inode", ret); goto out; } if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) { iput(inode); nilfs_err(sb, "corrupt root inode"); ret = -EINVAL; goto out; } if (root->cno == NILFS_CPTREE_CURRENT_CNO) { dentry = d_find_alias(inode); if (!dentry) { dentry = d_make_root(inode); if (!dentry) { ret = -ENOMEM; goto failed_dentry; } } else { iput(inode); } } else { dentry = d_obtain_root(inode); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto failed_dentry; } } *root_dentry = dentry; out: return ret; failed_dentry: nilfs_err(sb, "error %d getting root dentry", ret); goto out; } static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, struct dentry **root_dentry) { struct the_nilfs *nilfs = s->s_fs_info; struct nilfs_root *root; int ret; mutex_lock(&nilfs->ns_snapshot_mount_mutex); down_read(&nilfs->ns_segctor_sem); ret = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cno); up_read(&nilfs->ns_segctor_sem); if (ret < 0) { ret = (ret == -ENOENT) ? -EINVAL : ret; goto out; } else if (!ret) { nilfs_err(s, "The specified checkpoint is not a snapshot (checkpoint number=%llu)", (unsigned long long)cno); ret = -EINVAL; goto out; } ret = nilfs_attach_checkpoint(s, cno, false, &root); if (ret) { nilfs_err(s, "error %d while loading snapshot (checkpoint number=%llu)", ret, (unsigned long long)cno); goto out; } ret = nilfs_get_root_dentry(s, root, root_dentry); nilfs_put_root(root); out: mutex_unlock(&nilfs->ns_snapshot_mount_mutex); return ret; } /** * nilfs_tree_is_busy() - try to shrink dentries of a checkpoint * @root_dentry: root dentry of the tree to be shrunk * * This function returns true if the tree was in-use. */ static bool nilfs_tree_is_busy(struct dentry *root_dentry) { shrink_dcache_parent(root_dentry); return d_count(root_dentry) > 1; } int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_root *root; struct inode *inode; struct dentry *dentry; int ret; if (cno > nilfs->ns_cno) return false; if (cno >= nilfs_last_cno(nilfs)) return true; /* protect recent checkpoints */ ret = false; root = nilfs_lookup_root(nilfs, cno); if (root) { inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO); if (inode) { dentry = d_find_alias(inode); if (dentry) { ret = nilfs_tree_is_busy(dentry); dput(dentry); } iput(inode); } nilfs_put_root(root); } return ret; } /** * nilfs_fill_super() - initialize a super block instance * @sb: super_block * @data: mount options * @silent: silent mode flag * * This function is called exclusively by nilfs->ns_mount_mutex. * So, the recovery process is protected from other simultaneous mounts. 
*/ static int nilfs_fill_super(struct super_block *sb, void *data, int silent) { struct the_nilfs *nilfs; struct nilfs_root *fsroot; __u64 cno; int err; nilfs = alloc_nilfs(sb); if (!nilfs) return -ENOMEM; sb->s_fs_info = nilfs; err = init_nilfs(nilfs, sb, (char *)data); if (err) goto failed_nilfs; sb->s_op = &nilfs_sops; sb->s_export_op = &nilfs_export_ops; sb->s_root = NULL; sb->s_time_gran = 1; sb->s_max_links = NILFS_LINK_MAX; sb->s_bdi = bdi_get(sb->s_bdev->bd_disk->bdi); err = load_nilfs(nilfs, sb); if (err) goto failed_nilfs; cno = nilfs_last_cno(nilfs); err = nilfs_attach_checkpoint(sb, cno, true, &fsroot); if (err) { nilfs_err(sb, "error %d while loading last checkpoint (checkpoint number=%llu)", err, (unsigned long long)cno); goto failed_unload; } if (!sb_rdonly(sb)) { err = nilfs_attach_log_writer(sb, fsroot); if (err) goto failed_checkpoint; } err = nilfs_get_root_dentry(sb, fsroot, &sb->s_root); if (err) goto failed_segctor; nilfs_put_root(fsroot); if (!sb_rdonly(sb)) { down_write(&nilfs->ns_sem); nilfs_setup_super(sb, true); up_write(&nilfs->ns_sem); } return 0; failed_segctor: nilfs_detach_log_writer(sb); failed_checkpoint: nilfs_put_root(fsroot); failed_unload: nilfs_sysfs_delete_device_group(nilfs); iput(nilfs->ns_sufile); iput(nilfs->ns_cpfile); iput(nilfs->ns_dat); failed_nilfs: destroy_nilfs(nilfs); return err; } static int nilfs_remount(struct super_block *sb, int *flags, char *data) { struct the_nilfs *nilfs = sb->s_fs_info; unsigned long old_sb_flags; unsigned long old_mount_opt; int err; sync_filesystem(sb); old_sb_flags = sb->s_flags; old_mount_opt = nilfs->ns_mount_opt; if (!parse_options(data, sb, 1)) { err = -EINVAL; goto restore_opts; } sb->s_flags = (sb->s_flags & ~SB_POSIXACL); err = -EINVAL; if (!nilfs_valid_fs(nilfs)) { nilfs_warn(sb, "couldn't remount because the filesystem is in an incomplete recovery state"); goto restore_opts; } if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out; if (*flags & SB_RDONLY) { sb->s_flags |= SB_RDONLY; /* * Remounting a valid RW partition RDONLY, so set * the RDONLY flag and then mark the partition as valid again. */ down_write(&nilfs->ns_sem); nilfs_cleanup_super(sb); up_write(&nilfs->ns_sem); } else { __u64 features; struct nilfs_root *root; /* * Mounting a RDONLY partition read-write, so reread and * store the current valid flag. (It may have been changed * by fsck since we originally mounted the partition.) 
*/ down_read(&nilfs->ns_sem); features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & ~NILFS_FEATURE_COMPAT_RO_SUPP; up_read(&nilfs->ns_sem); if (features) { nilfs_warn(sb, "couldn't remount RDWR because of unsupported optional features (%llx)", (unsigned long long)features); err = -EROFS; goto restore_opts; } sb->s_flags &= ~SB_RDONLY; root = NILFS_I(d_inode(sb->s_root))->i_root; err = nilfs_attach_log_writer(sb, root); if (err) goto restore_opts; down_write(&nilfs->ns_sem); nilfs_setup_super(sb, true); up_write(&nilfs->ns_sem); } out: return 0; restore_opts: sb->s_flags = old_sb_flags; nilfs->ns_mount_opt = old_mount_opt; return err; } struct nilfs_super_data { __u64 cno; int flags; }; static int nilfs_parse_snapshot_option(const char *option, const substring_t *arg, struct nilfs_super_data *sd) { unsigned long long val; const char *msg = NULL; int err; if (!(sd->flags & SB_RDONLY)) { msg = "read-only option is not specified"; goto parse_error; } err = kstrtoull(arg->from, 0, &val); if (err) { if (err == -ERANGE) msg = "too large checkpoint number"; else msg = "malformed argument"; goto parse_error; } else if (val == 0) { msg = "invalid checkpoint number 0"; goto parse_error; } sd->cno = val; return 0; parse_error: nilfs_err(NULL, "invalid option \"%s\": %s", option, msg); return 1; } /** * nilfs_identify - pre-read mount options needed to identify mount instance * @data: mount options * @sd: nilfs_super_data */ static int nilfs_identify(char *data, struct nilfs_super_data *sd) { char *p, *options = data; substring_t args[MAX_OPT_ARGS]; int token; int ret = 0; do { p = strsep(&options, ","); if (p != NULL && *p) { token = match_token(p, tokens, args); if (token == Opt_snapshot) ret = nilfs_parse_snapshot_option(p, &args[0], sd); } if (!options) break; BUG_ON(options == data); *(options - 1) = ','; } while (!ret); return ret; } static int nilfs_set_bdev_super(struct super_block *s, void *data) { s->s_dev = *(dev_t *)data; return 0; } static int nilfs_test_bdev_super(struct super_block *s, void *data) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; } static struct dentry * nilfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { struct nilfs_super_data sd = { .flags = flags }; struct super_block *s; dev_t dev; int err; if (nilfs_identify(data, &sd)) return ERR_PTR(-EINVAL); err = lookup_bdev(dev_name, &dev); if (err) return ERR_PTR(err); s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, flags, &dev); if (IS_ERR(s)) return ERR_CAST(s); if (!s->s_root) { err = setup_bdev_super(s, flags, NULL); if (!err) err = nilfs_fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (err) goto failed_super; s->s_flags |= SB_ACTIVE; } else if (!sd.cno) { if (nilfs_tree_is_busy(s->s_root)) { if ((flags ^ s->s_flags) & SB_RDONLY) { nilfs_err(s, "the device already has a %s mount.", sb_rdonly(s) ? "read-only" : "read/write"); err = -EBUSY; goto failed_super; } } else { /* * Try remount to setup mount states if the current * tree is not mounted and only snapshots use this sb. 
*/ err = nilfs_remount(s, &flags, data); if (err) goto failed_super; } } if (sd.cno) { struct dentry *root_dentry; err = nilfs_attach_snapshot(s, sd.cno, &root_dentry); if (err) goto failed_super; return root_dentry; } return dget(s->s_root); failed_super: deactivate_locked_super(s); return ERR_PTR(err); } struct file_system_type nilfs_fs_type = { .owner = THIS_MODULE, .name = "nilfs2", .mount = nilfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("nilfs2"); static void nilfs_inode_init_once(void *obj) { struct nilfs_inode_info *ii = obj; INIT_LIST_HEAD(&ii->i_dirty); #ifdef CONFIG_NILFS_XATTR init_rwsem(&ii->xattr_sem); #endif inode_init_once(&ii->vfs_inode); } static void nilfs_segbuf_init_once(void *obj) { memset(obj, 0, sizeof(struct nilfs_segment_buffer)); } static void nilfs_destroy_cachep(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(nilfs_inode_cachep); kmem_cache_destroy(nilfs_transaction_cachep); kmem_cache_destroy(nilfs_segbuf_cachep); kmem_cache_destroy(nilfs_btree_path_cache); } static int __init nilfs_init_cachep(void) { nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", sizeof(struct nilfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, nilfs_inode_init_once); if (!nilfs_inode_cachep) goto fail; nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache", sizeof(struct nilfs_transaction_info), 0, SLAB_RECLAIM_ACCOUNT, NULL); if (!nilfs_transaction_cachep) goto fail; nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache", sizeof(struct nilfs_segment_buffer), 0, SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once); if (!nilfs_segbuf_cachep) goto fail; nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache", sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX, 0, 0, NULL); if (!nilfs_btree_path_cache) goto fail; return 0; fail: nilfs_destroy_cachep(); return -ENOMEM; } static int __init init_nilfs_fs(void) { int err; err = nilfs_init_cachep(); if (err) goto fail; err = nilfs_sysfs_init(); if (err) goto free_cachep; err = register_filesystem(&nilfs_fs_type); if (err) goto deinit_sysfs_entry; printk(KERN_INFO "NILFS version 2 loaded\n"); return 0; deinit_sysfs_entry: nilfs_sysfs_exit(); free_cachep: nilfs_destroy_cachep(); fail: return err; } static void __exit exit_nilfs_fs(void) { nilfs_destroy_cachep(); nilfs_sysfs_exit(); unregister_filesystem(&nilfs_fs_type); } module_init(init_nilfs_fs) module_exit(exit_nilfs_fs)
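/*
 * Illustrative sketch only (not NILFS code): nilfs_commit_super() above
 * checksums each on-disk super block by zeroing s_sum, running crc32_le
 * over the whole block with the filesystem's CRC seed, and storing the
 * result back into s_sum.  The standalone userspace program below uses a
 * hypothetical struct and a plain bitwise CRC-32 (the kernel's crc32_le
 * and seed conventions differ, so values are not meant to match); it only
 * shows the self-checksum pattern and how verification re-runs it.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_super {
	uint32_t magic;
	uint32_t sum;		/* checksum over the whole struct, taken with sum = 0 */
	uint64_t last_seq;
	char     label[16];
};

/* Plain reflected CRC-32 (poly 0xEDB88320), bit-at-a-time. */
static uint32_t crc32_buf(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1));
	}
	return crc;
}

static void toy_commit(struct toy_super *sb, uint32_t seed)
{
	sb->sum = 0;
	sb->sum = crc32_buf(seed, sb, sizeof(*sb));
}

static int toy_verify(const struct toy_super *sb, uint32_t seed)
{
	struct toy_super tmp = *sb;

	tmp.sum = 0;
	return crc32_buf(seed, &tmp, sizeof(tmp)) == sb->sum;
}

int main(void)
{
	struct toy_super sb = { .magic = 0x3434, .last_seq = 42 };

	strcpy(sb.label, "toy");
	toy_commit(&sb, 0);
	printf("stored sum 0x%08x, verify=%d\n", (unsigned)sb.sum, toy_verify(&sb, 0));

	sb.last_seq++;		/* simulate a stale or corrupted copy */
	printf("after change, verify=%d\n", toy_verify(&sb, 0));
	return 0;
}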
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM vmscan

#if !defined(_TRACE_VMSCAN_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_VMSCAN_H

#include <linux/types.h>
#include <linux/tracepoint.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <trace/events/mmflags.h>

#define RECLAIM_WB_ANON		0x0001u
#define RECLAIM_WB_FILE		0x0002u
#define RECLAIM_WB_MIXED	0x0010u
#define RECLAIM_WB_SYNC		0x0004u /* Unused, all reclaim async */
#define RECLAIM_WB_ASYNC	0x0008u
#define RECLAIM_WB_LRU		(RECLAIM_WB_ANON|RECLAIM_WB_FILE)

#define show_reclaim_flags(flags) \
	(flags) ? __print_flags(flags, "|", \
		{RECLAIM_WB_ANON, "RECLAIM_WB_ANON"}, \
		{RECLAIM_WB_FILE, "RECLAIM_WB_FILE"}, \
		{RECLAIM_WB_MIXED, "RECLAIM_WB_MIXED"}, \
		{RECLAIM_WB_SYNC, "RECLAIM_WB_SYNC"}, \
		{RECLAIM_WB_ASYNC, "RECLAIM_WB_ASYNC"} \
		) : "RECLAIM_WB_NONE"

#define _VMSCAN_THROTTLE_WRITEBACK	(1 << VMSCAN_THROTTLE_WRITEBACK)
#define _VMSCAN_THROTTLE_ISOLATED	(1 << VMSCAN_THROTTLE_ISOLATED)
#define _VMSCAN_THROTTLE_NOPROGRESS	(1 << VMSCAN_THROTTLE_NOPROGRESS)
#define _VMSCAN_THROTTLE_CONGESTED	(1 << VMSCAN_THROTTLE_CONGESTED)

#define show_throttle_flags(flags) \
	(flags) ? __print_flags(flags, "|", \
		{_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"}, \
		{_VMSCAN_THROTTLE_ISOLATED, "VMSCAN_THROTTLE_ISOLATED"}, \
		{_VMSCAN_THROTTLE_NOPROGRESS, "VMSCAN_THROTTLE_NOPROGRESS"}, \
		{_VMSCAN_THROTTLE_CONGESTED, "VMSCAN_THROTTLE_CONGESTED"} \
		) : "VMSCAN_THROTTLE_NONE"

#define trace_reclaim_flags(file) ( \
	(file ?
RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ (RECLAIM_WB_ASYNC) \ ) TRACE_EVENT(mm_vmscan_kswapd_sleep, TP_PROTO(int nid), TP_ARGS(nid), TP_STRUCT__entry( __field( int, nid ) ), TP_fast_assign( __entry->nid = nid; ), TP_printk("nid=%d", __entry->nid) ); TRACE_EVENT(mm_vmscan_kswapd_wake, TP_PROTO(int nid, int zid, int order), TP_ARGS(nid, zid, order), TP_STRUCT__entry( __field( int, nid ) __field( int, zid ) __field( int, order ) ), TP_fast_assign( __entry->nid = nid; __entry->zid = zid; __entry->order = order; ), TP_printk("nid=%d order=%d", __entry->nid, __entry->order) ); TRACE_EVENT(mm_vmscan_wakeup_kswapd, TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags), TP_ARGS(nid, zid, order, gfp_flags), TP_STRUCT__entry( __field( int, nid ) __field( int, zid ) __field( int, order ) __field( unsigned long, gfp_flags ) ), TP_fast_assign( __entry->nid = nid; __entry->zid = zid; __entry->order = order; __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("nid=%d order=%d gfp_flags=%s", __entry->nid, __entry->order, show_gfp_flags(__entry->gfp_flags)) ); DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, TP_PROTO(int order, gfp_t gfp_flags), TP_ARGS(order, gfp_flags), TP_STRUCT__entry( __field( int, order ) __field( unsigned long, gfp_flags ) ), TP_fast_assign( __entry->order = order; __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("order=%d gfp_flags=%s", __entry->order, show_gfp_flags(__entry->gfp_flags)) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, TP_PROTO(int order, gfp_t gfp_flags), TP_ARGS(order, gfp_flags) ); #ifdef CONFIG_MEMCG DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin, TP_PROTO(int order, gfp_t gfp_flags), TP_ARGS(order, gfp_flags) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin, TP_PROTO(int order, gfp_t gfp_flags), TP_ARGS(order, gfp_flags) ); #endif /* CONFIG_MEMCG */ DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template, TP_PROTO(unsigned long nr_reclaimed), TP_ARGS(nr_reclaimed), TP_STRUCT__entry( __field( unsigned long, nr_reclaimed ) ), TP_fast_assign( __entry->nr_reclaimed = nr_reclaimed; ), TP_printk("nr_reclaimed=%lu", __entry->nr_reclaimed) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_direct_reclaim_end, TP_PROTO(unsigned long nr_reclaimed), TP_ARGS(nr_reclaimed) ); #ifdef CONFIG_MEMCG DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_reclaim_end, TP_PROTO(unsigned long nr_reclaimed), TP_ARGS(nr_reclaimed) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_reclaim_end, TP_PROTO(unsigned long nr_reclaimed), TP_ARGS(nr_reclaimed) ); #endif /* CONFIG_MEMCG */ TRACE_EVENT(mm_shrink_slab_start, TP_PROTO(struct shrinker *shr, struct shrink_control *sc, long nr_objects_to_shrink, unsigned long cache_items, unsigned long long delta, unsigned long total_scan, int priority), TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan, priority), TP_STRUCT__entry( __field(struct shrinker *, shr) __field(void *, shrink) __field(int, nid) __field(long, nr_objects_to_shrink) __field(unsigned long, gfp_flags) __field(unsigned long, cache_items) __field(unsigned long long, delta) __field(unsigned long, total_scan) __field(int, priority) ), TP_fast_assign( __entry->shr = shr; __entry->shrink = shr->scan_objects; __entry->nid = sc->nid; __entry->nr_objects_to_shrink = nr_objects_to_shrink; __entry->gfp_flags = (__force unsigned 
long)sc->gfp_mask; __entry->cache_items = cache_items; __entry->delta = delta; __entry->total_scan = total_scan; __entry->priority = priority; ), TP_printk("%pS %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", __entry->shrink, __entry->shr, __entry->nid, __entry->nr_objects_to_shrink, show_gfp_flags(__entry->gfp_flags), __entry->cache_items, __entry->delta, __entry->total_scan, __entry->priority) ); TRACE_EVENT(mm_shrink_slab_end, TP_PROTO(struct shrinker *shr, int nid, int shrinker_retval, long unused_scan_cnt, long new_scan_cnt, long total_scan), TP_ARGS(shr, nid, shrinker_retval, unused_scan_cnt, new_scan_cnt, total_scan), TP_STRUCT__entry( __field(struct shrinker *, shr) __field(int, nid) __field(void *, shrink) __field(long, unused_scan) __field(long, new_scan) __field(int, retval) __field(long, total_scan) ), TP_fast_assign( __entry->shr = shr; __entry->nid = nid; __entry->shrink = shr->scan_objects; __entry->unused_scan = unused_scan_cnt; __entry->new_scan = new_scan_cnt; __entry->retval = shrinker_retval; __entry->total_scan = total_scan; ), TP_printk("%pS %p: nid: %d unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", __entry->shrink, __entry->shr, __entry->nid, __entry->unused_scan, __entry->new_scan, __entry->total_scan, __entry->retval) ); TRACE_EVENT(mm_vmscan_lru_isolate, TP_PROTO(int highest_zoneidx, int order, unsigned long nr_requested, unsigned long nr_scanned, unsigned long nr_skipped, unsigned long nr_taken, int lru), TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, lru), TP_STRUCT__entry( __field(int, highest_zoneidx) __field(int, order) __field(unsigned long, nr_requested) __field(unsigned long, nr_scanned) __field(unsigned long, nr_skipped) __field(unsigned long, nr_taken) __field(int, lru) ), TP_fast_assign( __entry->highest_zoneidx = highest_zoneidx; __entry->order = order; __entry->nr_requested = nr_requested; __entry->nr_scanned = nr_scanned; __entry->nr_skipped = nr_skipped; __entry->nr_taken = nr_taken; __entry->lru = lru; ), /* * classzone is previous name of the highest_zoneidx. * Reason not to change it is the ABI requirement of the tracepoint. 
*/ TP_printk("classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s", __entry->highest_zoneidx, __entry->order, __entry->nr_requested, __entry->nr_scanned, __entry->nr_skipped, __entry->nr_taken, __print_symbolic(__entry->lru, LRU_NAMES)) ); TRACE_EVENT(mm_vmscan_write_folio, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(unsigned long, pfn) __field(int, reclaim_flags) ), TP_fast_assign( __entry->pfn = folio_pfn(folio); __entry->reclaim_flags = trace_reclaim_flags( folio_is_file_lru(folio)); ), TP_printk("page=%p pfn=0x%lx flags=%s", pfn_to_page(__entry->pfn), __entry->pfn, show_reclaim_flags(__entry->reclaim_flags)) ); TRACE_EVENT(mm_vmscan_lru_shrink_inactive, TP_PROTO(int nid, unsigned long nr_scanned, unsigned long nr_reclaimed, struct reclaim_stat *stat, int priority, int file), TP_ARGS(nid, nr_scanned, nr_reclaimed, stat, priority, file), TP_STRUCT__entry( __field(int, nid) __field(unsigned long, nr_scanned) __field(unsigned long, nr_reclaimed) __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) __field(unsigned long, nr_congested) __field(unsigned long, nr_immediate) __field(unsigned int, nr_activate0) __field(unsigned int, nr_activate1) __field(unsigned long, nr_ref_keep) __field(unsigned long, nr_unmap_fail) __field(int, priority) __field(int, reclaim_flags) ), TP_fast_assign( __entry->nid = nid; __entry->nr_scanned = nr_scanned; __entry->nr_reclaimed = nr_reclaimed; __entry->nr_dirty = stat->nr_dirty; __entry->nr_writeback = stat->nr_writeback; __entry->nr_congested = stat->nr_congested; __entry->nr_immediate = stat->nr_immediate; __entry->nr_activate0 = stat->nr_activate[0]; __entry->nr_activate1 = stat->nr_activate[1]; __entry->nr_ref_keep = stat->nr_ref_keep; __entry->nr_unmap_fail = stat->nr_unmap_fail; __entry->priority = priority; __entry->reclaim_flags = trace_reclaim_flags(file); ), TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s", __entry->nid, __entry->nr_scanned, __entry->nr_reclaimed, __entry->nr_dirty, __entry->nr_writeback, __entry->nr_congested, __entry->nr_immediate, __entry->nr_activate0, __entry->nr_activate1, __entry->nr_ref_keep, __entry->nr_unmap_fail, __entry->priority, show_reclaim_flags(__entry->reclaim_flags)) ); TRACE_EVENT(mm_vmscan_lru_shrink_active, TP_PROTO(int nid, unsigned long nr_taken, unsigned long nr_active, unsigned long nr_deactivated, unsigned long nr_referenced, int priority, int file), TP_ARGS(nid, nr_taken, nr_active, nr_deactivated, nr_referenced, priority, file), TP_STRUCT__entry( __field(int, nid) __field(unsigned long, nr_taken) __field(unsigned long, nr_active) __field(unsigned long, nr_deactivated) __field(unsigned long, nr_referenced) __field(int, priority) __field(int, reclaim_flags) ), TP_fast_assign( __entry->nid = nid; __entry->nr_taken = nr_taken; __entry->nr_active = nr_active; __entry->nr_deactivated = nr_deactivated; __entry->nr_referenced = nr_referenced; __entry->priority = priority; __entry->reclaim_flags = trace_reclaim_flags(file); ), TP_printk("nid=%d nr_taken=%ld nr_active=%ld nr_deactivated=%ld nr_referenced=%ld priority=%d flags=%s", __entry->nid, __entry->nr_taken, __entry->nr_active, __entry->nr_deactivated, __entry->nr_referenced, __entry->priority, show_reclaim_flags(__entry->reclaim_flags)) ); TRACE_EVENT(mm_vmscan_node_reclaim_begin, TP_PROTO(int nid, int order, gfp_t 
gfp_flags), TP_ARGS(nid, order, gfp_flags), TP_STRUCT__entry( __field(int, nid) __field(int, order) __field(unsigned long, gfp_flags) ), TP_fast_assign( __entry->nid = nid; __entry->order = order; __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("nid=%d order=%d gfp_flags=%s", __entry->nid, __entry->order, show_gfp_flags(__entry->gfp_flags)) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end, TP_PROTO(unsigned long nr_reclaimed), TP_ARGS(nr_reclaimed) ); TRACE_EVENT(mm_vmscan_throttled, TP_PROTO(int nid, int usec_timeout, int usec_delayed, int reason), TP_ARGS(nid, usec_timeout, usec_delayed, reason), TP_STRUCT__entry( __field(int, nid) __field(int, usec_timeout) __field(int, usec_delayed) __field(int, reason) ), TP_fast_assign( __entry->nid = nid; __entry->usec_timeout = usec_timeout; __entry->usec_delayed = usec_delayed; __entry->reason = 1U << reason; ), TP_printk("nid=%d usec_timeout=%d usect_delayed=%d reason=%s", __entry->nid, __entry->usec_timeout, __entry->usec_delayed, show_throttle_flags(__entry->reason)) ); #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
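/*
 * Illustrative sketch, not part of the original header: each TRACE_EVENT(name, ...)
 * above generates a trace_name() helper that reclaim code calls at the
 * instrumented point. The caller below is hypothetical (example_reclaim_throttle
 * is a made-up name); it assumes CREATE_TRACE_POINTS is defined in exactly one
 * compilation unit, as mm/vmscan.c does for this header, and that the
 * VMSCAN_THROTTLE_* reason enum from <linux/mmzone.h> is in scope.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

static void example_reclaim_throttle(int nid, int timeout_us, int delayed_us)
{
	/* Fires the mm_vmscan_throttled event defined above. */
	trace_mm_vmscan_throttled(nid, timeout_us, delayed_us,
				  VMSCAN_THROTTLE_WRITEBACK);
}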
// SPDX-License-Identifier: GPL-2.0 #include <linux/static_call.h> long __static_call_return0(void) { return 0; } EXPORT_SYMBOL_GPL(__static_call_return0);
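/*
 * Illustrative sketch, not part of the original file: __static_call_return0()
 * gives a static call a cheap default "return 0" target that the core can
 * patch in instead of leaving an indirect call. my_hook()/my_call()/
 * my_enable_and_use() below are made-up names; the macros are the regular
 * <linux/static_call.h> API.
 */
#include <linux/static_call.h>

static int my_hook(int cpu)
{
	return cpu * 2;		/* real implementation, installed on demand */
}

/* The call starts out bound to __static_call_return0, i.e. a cheap "return 0". */
DEFINE_STATIC_CALL_RET0(my_call, my_hook);

static int my_enable_and_use(void)
{
	/* Re-point every call site at the real implementation at runtime. */
	static_call_update(my_call, my_hook);

	return static_call(my_call)(1);
}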
/* inflate.c -- zlib decompression * Copyright (C) 1995-2005 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * * Based on zlib 1.2.3 but modified for the Linux Kernel by * Richard Purdie <richard@openedhand.com> * * Changes mainly for static instead of dynamic memory allocation * */ #include <linux/zutil.h>
#include "inftrees.h" #include "inflate.h" #include "inffast.h" #include "infutil.h" /* architecture-specific bits */ #ifdef CONFIG_ZLIB_DFLTCC # include "../zlib_dfltcc/dfltcc_inflate.h" #else #define INFLATE_RESET_HOOK(strm) do {} while (0) #define INFLATE_TYPEDO_HOOK(strm, flush) do {} while (0) #define INFLATE_NEED_UPDATEWINDOW(strm) 1 #define INFLATE_NEED_CHECKSUM(strm) 1 #endif int zlib_inflate_workspacesize(void) { return sizeof(struct inflate_workspace); } int zlib_inflateReset(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; strm->total_in = strm->total_out = state->total = 0; strm->msg = NULL; strm->adler = 1; /* to support ill-conceived Java test suite */ state->mode = HEAD; state->last = 0; state->havedict = 0; state->dmax = 32768U; state->hold = 0; state->bits = 0; state->lencode = state->distcode = state->next = state->codes; /* Initialise Window */ state->wsize = 1U << state->wbits; state->write = 0; state->whave = 0; INFLATE_RESET_HOOK(strm); return Z_OK; } int zlib_inflateInit2(z_streamp strm, int windowBits) { struct inflate_state *state; if (strm == NULL) return Z_STREAM_ERROR; strm->msg = NULL; /* in case we return an error */ state = &WS(strm)->inflate_state; strm->state = (struct internal_state *)state; if (windowBits < 0) { state->wrap = 0; windowBits = -windowBits; } else { state->wrap = (windowBits >> 4) + 1; } if (windowBits < 8 || windowBits > 15) { return Z_STREAM_ERROR; } state->wbits = (unsigned)windowBits; #ifdef CONFIG_ZLIB_DFLTCC /* * DFLTCC requires the window to be page aligned. * Thus, we overallocate and take the aligned portion of the buffer. */ state->window = PTR_ALIGN(&WS(strm)->working_window[0], PAGE_SIZE); #else state->window = &WS(strm)->working_window[0]; #endif return zlib_inflateReset(strm); } /* Return state with length and distance decoding tables and index sizes set to fixed code decoding. This returns fixed tables from inffixed.h. */ static void zlib_fixedtables(struct inflate_state *state) { # include "inffixed.h" state->lencode = lenfix; state->lenbits = 9; state->distcode = distfix; state->distbits = 5; } /* Update the window with the last wsize (normally 32K) bytes written before returning. This is only called when a window is already in use, or when output has been written during this inflate call, but the end of the deflate stream has not been reached yet. It is also called to window dictionary data when a dictionary is loaded. Providing output buffers larger than 32K to inflate() should provide a speed advantage, since only the last 32K of output is copied to the sliding window upon return from inflate(), and since all distances after the first 32K of output will fall in the output data, making match copies simpler and faster. The advantage may be dependent on the size of the processor's data caches. 
*/ static void zlib_updatewindow(z_streamp strm, unsigned out) { struct inflate_state *state; unsigned copy, dist; state = (struct inflate_state *)strm->state; /* copy state->wsize or less output bytes into the circular window */ copy = out - strm->avail_out; if (copy >= state->wsize) { memcpy(state->window, strm->next_out - state->wsize, state->wsize); state->write = 0; state->whave = state->wsize; } else { dist = state->wsize - state->write; if (dist > copy) dist = copy; memcpy(state->window + state->write, strm->next_out - copy, dist); copy -= dist; if (copy) { memcpy(state->window, strm->next_out - copy, copy); state->write = copy; state->whave = state->wsize; } else { state->write += dist; if (state->write == state->wsize) state->write = 0; if (state->whave < state->wsize) state->whave += dist; } } } /* * At the end of a Deflate-compressed PPP packet, we expect to have seen * a `stored' block type value but not the (zero) length bytes. */ /* Returns true if inflate is currently at the end of a block generated by Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP implementation to provide an additional safety check. PPP uses Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored block. When decompressing, PPP checks that at the end of input packet, inflate is waiting for these length bytes. */ static int zlib_inflateSyncPacket(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == STORED && state->bits == 0) { state->mode = TYPE; return Z_OK; } return Z_DATA_ERROR; } /* Macros for inflate(): */ /* check function to use adler32() for zlib or crc32() for gzip */ #define UPDATE(check, buf, len) zlib_adler32(check, buf, len) /* Load registers with state in inflate() for speed */ #define LOAD() \ do { \ put = strm->next_out; \ left = strm->avail_out; \ next = strm->next_in; \ have = strm->avail_in; \ hold = state->hold; \ bits = state->bits; \ } while (0) /* Restore state from registers in inflate() */ #define RESTORE() \ do { \ strm->next_out = put; \ strm->avail_out = left; \ strm->next_in = next; \ strm->avail_in = have; \ state->hold = hold; \ state->bits = bits; \ } while (0) /* Clear the input bit accumulator */ #define INITBITS() \ do { \ hold = 0; \ bits = 0; \ } while (0) /* Get a byte of input into the bit accumulator, or return from inflate() if there is no input available. */ #define PULLBYTE() \ do { \ if (have == 0) goto inf_leave; \ have--; \ hold += (unsigned long)(*next++) << bits; \ bits += 8; \ } while (0) /* Assure that there are at least n bits in the bit accumulator. If there is not enough available input to do that, then return from inflate(). */ #define NEEDBITS(n) \ do { \ while (bits < (unsigned)(n)) \ PULLBYTE(); \ } while (0) /* Return the low n bits of the bit accumulator (n < 16) */ #define BITS(n) \ ((unsigned)hold & ((1U << (n)) - 1)) /* Remove n bits from the bit accumulator */ #define DROPBITS(n) \ do { \ hold >>= (n); \ bits -= (unsigned)(n); \ } while (0) /* Remove zero to seven bits as needed to go to a byte boundary */ #define BYTEBITS() \ do { \ hold >>= bits & 7; \ bits -= bits & 7; \ } while (0) /* inflate() uses a state machine to process as much input data and generate as much output data as possible before returning. The state machine is structured roughly as follows: for (;;) switch (state) { ... case STATEn: if (not enough input data or output space to make progress) return; ... make progress ... 
state = STATEm; break; ... } so when inflate() is called again, the same case is attempted again, and if the appropriate resources are provided, the machine proceeds to the next state. The NEEDBITS() macro is usually the way the state evaluates whether it can proceed or should return. NEEDBITS() does the return if the requested bits are not available. The typical use of the BITS macros is: NEEDBITS(n); ... do something with BITS(n) ... DROPBITS(n); where NEEDBITS(n) either returns from inflate() if there isn't enough input left to load n bits into the accumulator, or it continues. BITS(n) gives the low n bits in the accumulator. When done, DROPBITS(n) drops the low n bits off the accumulator. INITBITS() clears the accumulator and sets the number of available bits to zero. BYTEBITS() discards just enough bits to put the accumulator on a byte boundary. After BYTEBITS() and a NEEDBITS(8), then BITS(8) would return the next byte in the stream. NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return if there is no input available. The decoding of variable length codes uses PULLBYTE() directly in order to pull just enough bytes to decode the next code, and no more. Some states loop until they get enough input, making sure that enough state information is maintained to continue the loop where it left off if NEEDBITS() returns in the loop. For example, want, need, and keep would all have to actually be part of the saved state in case NEEDBITS() returns: case STATEw: while (want < need) { NEEDBITS(n); keep[want++] = BITS(n); DROPBITS(n); } state = STATEx; case STATEx: As shown above, if the next state is also the next case, then the break is omitted. A state may also return if there is not enough output space available to complete that state. Those states are copying stored data, writing a literal byte, and copying a matching string. When returning, a "goto inf_leave" is used to update the total counters, update the check value, and determine whether any progress has been made during that inflate() call in order to return the proper return code. Progress is defined as a change in either strm->avail_in or strm->avail_out. When there is a window, goto inf_leave will update the window with the last output written. If a goto inf_leave occurs in the middle of decompression and there is no window currently, goto inf_leave will create one and copy output to the window for the next call of inflate(). In this implementation, the flush parameter of inflate() only affects the return code (per zlib.h). inflate() always writes as much as possible to strm->next_out, given the space available and the provided input--the effect documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers the allocation of and copying into a sliding window until necessary, which provides the effect documented in zlib.h for Z_FINISH when the entire input stream available. So the only thing the flush parameter actually does is: when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it will return Z_BUF_ERROR if it has not reached the end of the stream. 
*/ int zlib_inflate(z_streamp strm, int flush) { struct inflate_state *state; const unsigned char *next; /* next input */ unsigned char *put; /* next output */ unsigned have, left; /* available input and output */ unsigned long hold; /* bit buffer */ unsigned bits; /* bits in bit buffer */ unsigned in, out; /* save starting available input and output */ unsigned copy; /* number of stored or match bytes to copy */ unsigned char *from; /* where to copy match bytes from */ code this; /* current decoding table entry */ code last; /* parent table entry */ unsigned len; /* length to copy for repeats, bits to drop */ int ret; /* return code */ static const unsigned short order[19] = /* permutation of code lengths */ {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; /* Do not check for strm->next_out == NULL here as ppc zImage inflates to strm->next_out = 0 */ if (strm == NULL || strm->state == NULL || (strm->next_in == NULL && strm->avail_in != 0)) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */ LOAD(); in = have; out = left; ret = Z_OK; for (;;) switch (state->mode) { case HEAD: if (state->wrap == 0) { state->mode = TYPEDO; break; } NEEDBITS(16); if ( ((BITS(8) << 8) + (hold >> 8)) % 31) { strm->msg = (char *)"incorrect header check"; state->mode = BAD; break; } if (BITS(4) != Z_DEFLATED) { strm->msg = (char *)"unknown compression method"; state->mode = BAD; break; } DROPBITS(4); len = BITS(4) + 8; if (len > state->wbits) { strm->msg = (char *)"invalid window size"; state->mode = BAD; break; } state->dmax = 1U << len; strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = hold & 0x200 ? DICTID : TYPE; INITBITS(); break; case DICTID: NEEDBITS(32); strm->adler = state->check = REVERSE(hold); INITBITS(); state->mode = DICT; fallthrough; case DICT: if (state->havedict == 0) { RESTORE(); return Z_NEED_DICT; } strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = TYPE; fallthrough; case TYPE: if (flush == Z_BLOCK) goto inf_leave; fallthrough; case TYPEDO: INFLATE_TYPEDO_HOOK(strm, flush); if (state->last) { BYTEBITS(); state->mode = CHECK; break; } NEEDBITS(3); state->last = BITS(1); DROPBITS(1); switch (BITS(2)) { case 0: /* stored block */ state->mode = STORED; break; case 1: /* fixed block */ zlib_fixedtables(state); state->mode = LEN; /* decode codes */ break; case 2: /* dynamic block */ state->mode = TABLE; break; case 3: strm->msg = (char *)"invalid block type"; state->mode = BAD; } DROPBITS(2); break; case STORED: BYTEBITS(); /* go to byte boundary */ NEEDBITS(32); if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) { strm->msg = (char *)"invalid stored block lengths"; state->mode = BAD; break; } state->length = (unsigned)hold & 0xffff; INITBITS(); state->mode = COPY; fallthrough; case COPY: copy = state->length; if (copy) { if (copy > have) copy = have; if (copy > left) copy = left; if (copy == 0) goto inf_leave; memcpy(put, next, copy); have -= copy; next += copy; left -= copy; put += copy; state->length -= copy; break; } state->mode = TYPE; break; case TABLE: NEEDBITS(14); state->nlen = BITS(5) + 257; DROPBITS(5); state->ndist = BITS(5) + 1; DROPBITS(5); state->ncode = BITS(4) + 4; DROPBITS(4); #ifndef PKZIP_BUG_WORKAROUND if (state->nlen > 286 || state->ndist > 30) { strm->msg = (char *)"too many length or distance symbols"; state->mode = BAD; break; } #endif state->have = 0; state->mode = LENLENS; fallthrough; case LENLENS: while (state->have < state->ncode) { 
NEEDBITS(3); state->lens[order[state->have++]] = (unsigned short)BITS(3); DROPBITS(3); } while (state->have < 19) state->lens[order[state->have++]] = 0; state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 7; ret = zlib_inflate_table(CODES, state->lens, 19, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid code lengths set"; state->mode = BAD; break; } state->have = 0; state->mode = CODELENS; fallthrough; case CODELENS: while (state->have < state->nlen + state->ndist) { for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.val < 16) { NEEDBITS(this.bits); DROPBITS(this.bits); state->lens[state->have++] = this.val; } else { if (this.val == 16) { NEEDBITS(this.bits + 2); DROPBITS(this.bits); if (state->have == 0) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } len = state->lens[state->have - 1]; copy = 3 + BITS(2); DROPBITS(2); } else if (this.val == 17) { NEEDBITS(this.bits + 3); DROPBITS(this.bits); len = 0; copy = 3 + BITS(3); DROPBITS(3); } else { NEEDBITS(this.bits + 7); DROPBITS(this.bits); len = 0; copy = 11 + BITS(7); DROPBITS(7); } if (state->have + copy > state->nlen + state->ndist) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } while (copy--) state->lens[state->have++] = (unsigned short)len; } } /* handle error breaks in while */ if (state->mode == BAD) break; /* build code tables */ state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 9; ret = zlib_inflate_table(LENS, state->lens, state->nlen, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid literal/lengths set"; state->mode = BAD; break; } state->distcode = (code const *)(state->next); state->distbits = 6; ret = zlib_inflate_table(DISTS, state->lens + state->nlen, state->ndist, &(state->next), &(state->distbits), state->work); if (ret) { strm->msg = (char *)"invalid distances set"; state->mode = BAD; break; } state->mode = LEN; fallthrough; case LEN: if (have >= 6 && left >= 258) { RESTORE(); inflate_fast(strm, out); LOAD(); break; } for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.op && (this.op & 0xf0) == 0) { last = this; for (;;) { this = state->lencode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); state->length = (unsigned)this.val; if ((int)(this.op) == 0) { state->mode = LIT; break; } if (this.op & 32) { state->mode = TYPE; break; } if (this.op & 64) { strm->msg = (char *)"invalid literal/length code"; state->mode = BAD; break; } state->extra = (unsigned)(this.op) & 15; state->mode = LENEXT; fallthrough; case LENEXT: if (state->extra) { NEEDBITS(state->extra); state->length += BITS(state->extra); DROPBITS(state->extra); } state->mode = DIST; fallthrough; case DIST: for (;;) { this = state->distcode[BITS(state->distbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if ((this.op & 0xf0) == 0) { last = this; for (;;) { this = state->distcode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); if (this.op & 64) { strm->msg = (char *)"invalid distance code"; state->mode = BAD; break; } state->offset = (unsigned)this.val; state->extra 
= (unsigned)(this.op) & 15; state->mode = DISTEXT; fallthrough; case DISTEXT: if (state->extra) { NEEDBITS(state->extra); state->offset += BITS(state->extra); DROPBITS(state->extra); } #ifdef INFLATE_STRICT if (state->offset > state->dmax) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } #endif if (state->offset > state->whave + out - left) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } state->mode = MATCH; fallthrough; case MATCH: if (left == 0) goto inf_leave; copy = out - left; if (state->offset > copy) { /* copy from window */ copy = state->offset - copy; if (copy > state->write) { copy -= state->write; from = state->window + (state->wsize - copy); } else from = state->window + (state->write - copy); if (copy > state->length) copy = state->length; } else { /* copy from output */ from = put - state->offset; copy = state->length; } if (copy > left) copy = left; left -= copy; state->length -= copy; do { *put++ = *from++; } while (--copy); if (state->length == 0) state->mode = LEN; break; case LIT: if (left == 0) goto inf_leave; *put++ = (unsigned char)(state->length); left--; state->mode = LEN; break; case CHECK: if (state->wrap) { NEEDBITS(32); out -= left; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && out) strm->adler = state->check = UPDATE(state->check, put - out, out); out = left; if (( REVERSE(hold)) != state->check) { strm->msg = (char *)"incorrect data check"; state->mode = BAD; break; } INITBITS(); } state->mode = DONE; fallthrough; case DONE: ret = Z_STREAM_END; goto inf_leave; case BAD: ret = Z_DATA_ERROR; goto inf_leave; case MEM: return Z_MEM_ERROR; case SYNC: default: return Z_STREAM_ERROR; } /* Return from inflate(), updating the total counts and the check value. If there was no progress during the inflate() call, return a buffer error. Call zlib_updatewindow() to create and/or update the window state. */ inf_leave: RESTORE(); if (INFLATE_NEED_UPDATEWINDOW(strm) && (state->wsize || (state->mode < CHECK && out != strm->avail_out))) zlib_updatewindow(strm, out); in -= strm->avail_in; out -= strm->avail_out; strm->total_in += in; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && state->wrap && out) strm->adler = state->check = UPDATE(state->check, strm->next_out - out, out); strm->data_type = state->bits + (state->last ? 64 : 0) + (state->mode == TYPE ? 128 : 0); if (flush == Z_PACKET_FLUSH && ret == Z_OK && strm->avail_out != 0 && strm->avail_in == 0) return zlib_inflateSyncPacket(strm); if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK) ret = Z_BUF_ERROR; return ret; } int zlib_inflateEnd(z_streamp strm) { if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; return Z_OK; } /* * This subroutine adds the data at next_in/avail_in to the output history * without performing any output. The output buffer must be "caught up"; * i.e. no pending output but this should always be the case. The state must * be waiting on the start of a block (i.e. mode == TYPE or HEAD). On exit, * the output will also be caught up, and the checksum will have been updated * if need be. 
*/ int zlib_inflateIncomp(z_stream *z) { struct inflate_state *state = (struct inflate_state *)z->state; Byte *saved_no = z->next_out; uInt saved_ao = z->avail_out; if (state->mode != TYPE && state->mode != HEAD) return Z_DATA_ERROR; /* Setup some variables to allow misuse of updateWindow */ z->avail_out = 0; z->next_out = (unsigned char*)z->next_in + z->avail_in; zlib_updatewindow(z, z->avail_in); /* Restore saved variables */ z->avail_out = saved_ao; z->next_out = saved_no; z->adler = state->check = UPDATE(state->check, z->next_in, z->avail_in); z->total_out += z->avail_in; z->total_in += z->avail_in; z->next_in += z->avail_in; state->total += z->avail_in; z->avail_in = 0; return Z_OK; }
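/*
 * Illustrative sketch, not part of the original file: a minimal caller of the
 * kernel zlib_inflate() API above, assuming a zlib-wrapped (not raw or gzip)
 * input buffer that fits the supplied output buffer in one pass. Names are
 * made up and error handling is abbreviated.
 */
#include <linux/zlib.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>

static int example_unzip(const u8 *in, unsigned int in_len,
			 u8 *out, unsigned int out_len)
{
	z_stream strm = {};
	int ret;

	/* One preallocated workspace replaces zlib's dynamic allocations. */
	strm.workspace = vmalloc(zlib_inflate_workspacesize());
	if (!strm.workspace)
		return -ENOMEM;

	ret = zlib_inflateInit2(&strm, MAX_WBITS);	/* 32K window, zlib header */
	if (ret != Z_OK) {
		ret = -EIO;
		goto out;
	}

	strm.next_in = in;
	strm.avail_in = in_len;
	strm.next_out = out;
	strm.avail_out = out_len;

	/* Z_FINISH: expect the whole stream now; anything short is an error. */
	ret = zlib_inflate(&strm, Z_FINISH);
	zlib_inflateEnd(&strm);
	ret = (ret == Z_STREAM_END) ? 0 : -EIO;
out:
	vfree(strm.workspace);
	return ret;
}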
/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. */ #ifndef BTRFS_TRANSACTION_H #define BTRFS_TRANSACTION_H #include <linux/refcount.h> #include "btrfs_inode.h" #include "delayed-ref.h" #include "ctree.h" #include "misc.h" /* Radix-tree tag for roots that are part of the transaction. */ #define BTRFS_ROOT_TRANS_TAG 0 enum btrfs_trans_state { TRANS_STATE_RUNNING, TRANS_STATE_COMMIT_PREP, TRANS_STATE_COMMIT_START, TRANS_STATE_COMMIT_DOING, TRANS_STATE_UNBLOCKED, TRANS_STATE_SUPER_COMMITTED, TRANS_STATE_COMPLETED, TRANS_STATE_MAX, }; #define BTRFS_TRANS_HAVE_FREE_BGS 0 #define BTRFS_TRANS_DIRTY_BG_RUN 1 #define BTRFS_TRANS_CACHE_ENOSPC 2 struct btrfs_transaction { u64 transid; /* * Total number of external writers (USERSPACE/START/ATTACH) in this * transaction; it must be zero before the transaction can be * committed. */ atomic_t num_extwriters; /* * Total number of writers in this transaction; it must be zero before * the transaction can end. */ atomic_t num_writers; refcount_t use_count; unsigned long flags; /* Protected by fs_info->trans_lock; take it when changing the state. */ enum btrfs_trans_state state; int aborted; struct list_head list; struct extent_io_tree dirty_pages; time64_t start_time; wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; struct list_head pending_snapshots; struct list_head dev_update_list; struct list_head switch_commits; struct list_head dirty_bgs; /* * There is no explicit lock which protects io_bgs, rather its * consistency is implied by the fact that all the sites which modify * it do so under some form of transaction critical section, namely: * * - btrfs_start_dirty_block_groups - This function can only ever be * run by one of the transaction committers. Refer to * BTRFS_TRANS_DIRTY_BG_RUN usage in btrfs_commit_transaction * * - btrfs_write_dirty_blockgroups - this is called by * commit_cowonly_roots from transaction critical section * (TRANS_STATE_COMMIT_DOING) * * - btrfs_cleanup_dirty_bgs - called on transaction abort */ struct list_head io_bgs; struct list_head dropped_roots; struct extent_io_tree pinned_extents; /* * We need to make sure block group deletion doesn't race with * free space cache writeout. This mutex keeps them from stomping * on each other. */ struct mutex cache_write_mutex; spinlock_t dirty_bgs_lock; /* Protected by spin lock fs_info->unused_bgs_lock.
*/ struct list_head deleted_bgs; spinlock_t dropped_roots_lock; struct btrfs_delayed_ref_root delayed_refs; struct btrfs_fs_info *fs_info; /* * Number of ordered extents the transaction must wait for before * committing. These are ordered extents started by a fast fsync. */ atomic_t pending_ordered; wait_queue_head_t pending_wait; }; enum { ENUM_BIT(__TRANS_FREEZABLE), ENUM_BIT(__TRANS_START), ENUM_BIT(__TRANS_ATTACH), ENUM_BIT(__TRANS_JOIN), ENUM_BIT(__TRANS_JOIN_NOLOCK), ENUM_BIT(__TRANS_DUMMY), ENUM_BIT(__TRANS_JOIN_NOSTART), }; #define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) #define TRANS_ATTACH (__TRANS_ATTACH) #define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE) #define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK) #define TRANS_JOIN_NOSTART (__TRANS_JOIN_NOSTART) #define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; u64 delayed_refs_bytes_reserved; u64 chunk_bytes_reserved; unsigned long delayed_ref_updates; unsigned long delayed_ref_csum_deletions; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; /* Set by a task that wants to create a snapshot. */ struct btrfs_pending_snapshot *pending_snapshot; refcount_t use_count; unsigned int type; /* * Error code of transaction abort, set outside of locks and must use * the READ_ONCE/WRITE_ONCE access */ short aborted; bool adding_csums; bool allocating_chunk; bool removing_chunk; bool reloc_reserved; bool in_fsync; struct btrfs_fs_info *fs_info; struct list_head new_bgs; struct btrfs_block_rsv delayed_rsv; }; /* * The abort status can be changed between calls and is not protected by locks. * This accepts btrfs_transaction and btrfs_trans_handle as types. Once it's * set to a non-zero value it does not change, so the macro should be in checks * but is not necessary for further reads of the value. */ #define TRANS_ABORTED(trans) (unlikely(READ_ONCE((trans)->aborted))) struct btrfs_pending_snapshot { struct dentry *dentry; struct inode *dir; struct btrfs_root *root; struct btrfs_root_item *root_item; struct btrfs_root *snap; struct btrfs_qgroup_inherit *inherit; struct btrfs_path *path; /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; /* extra metadata reservation for relocation */ int error; /* Preallocated anonymous block device number */ dev_t anon_dev; bool readonly; struct list_head list; }; static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { spin_lock(&inode->lock); inode->last_trans = trans->transaction->transid; inode->last_sub_trans = btrfs_get_root_log_transid(inode->root); inode->last_log_commit = inode->last_sub_trans - 1; spin_unlock(&inode->lock); } /* * Make qgroup codes to skip given qgroupid, means the old/new_roots for * qgroup won't contain the qgroupid in it. 
*/ static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) { struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; WARN_ON(delayed_refs->qgroup_to_skip); delayed_refs->qgroup_to_skip = qgroupid; } static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) { struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; WARN_ON(!delayed_refs->qgroup_to_skip); delayed_refs->qgroup_to_skip = 0; } bool __cold abort_should_print_stack(int error); /* * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact stack trace is reported for some errors. */ #define btrfs_abort_transaction(trans, error) \ do { \ bool first = false; \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ first = true; \ if (WARN(abort_should_print_stack(error), \ KERN_ERR \ "BTRFS: Transaction aborted (error %d)\n", \ (error))) { \ /* Stack trace printed. */ \ } else { \ btrfs_err((trans)->fs_info, \ "Transaction aborted (error %d)", \ (error)); \ } \ } \ __btrfs_abort_transaction((trans), __func__, \ __LINE__, (error), first); \ } while (0) int btrfs_end_transaction(struct btrfs_trans_handle *trans); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, unsigned int num_items); struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( struct btrfs_root *root, unsigned int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction_barrier( struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid); void btrfs_add_dead_root(struct btrfs_root *root); void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info); int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info); int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark); int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, unsigned int line, int error, bool first_hit); int __init btrfs_transaction_init(void); void __cold btrfs_transaction_exit(void); #endif
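/*
 * Illustrative sketch, not part of btrfs: the typical lifecycle of the handle
 * API declared above. example_update() and do_some_tree_modification() are
 * hypothetical names, and the "1 item" reservation is arbitrary; the
 * start/abort/end pattern mirrors how callers in fs/btrfs generally use these
 * helpers.
 */
#include "transaction.h"

static int example_update(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/* Reserve space for one tree item and start (or join) a transaction. */
	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = do_some_tree_modification(trans, root);	/* hypothetical helper */
	if (ret) {
		/* Mark the whole transaction as failed (logs a stack trace). */
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}

	/* Commit makes the modification durable and drops the handle. */
	return btrfs_commit_transaction(trans);
}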
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,net type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Range as input support for IPv4 added */ /* 3 nomatch flag support added */ /* 4 Counters support added */ /* 5 Comments support added */ /* 6 Forceadd support added */ /* 7 skbinfo support added */ #define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,net"); /* Type specific function prefix */ #define HTYPE hash_ipportnet /* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 * However this way we have to store internally cidr - 1, *
dancing back and forth. */ #define IP_SET_HASH_WITH_NETS_PACKED #define IP_SET_HASH_WITH_PROTO #define IP_SET_HASH_WITH_NETS /* IPv4 variant */ /* Member elements */ struct hash_ipportnet4_elem { __be32 ip; __be32 ip2; __be16 port; u8 cidr:7; u8 nomatch:1; u8 proto; }; /* Common functions */ static bool hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, const struct hash_ipportnet4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->ip2 == ip2->ip2 && ip1->cidr == ip2->cidr && ip1->port == ip2->port && ip1->proto == ip2->proto; } static int hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static void hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) { elem->ip2 &= ip_set_netmask(cidr); elem->cidr = cidr - 1; } static bool hash_ipportnet4_data_list(struct sk_buff *skb, const struct hash_ipportnet4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, const struct hash_ipportnet4_elem *d) { next->ip = d->ip; next->port = d->port; next->ip2 = d->ip2; } #define MTYPE hash_ipportnet4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ipportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (adt == IPSET_TEST) e.cidr = HOST_MASK - 1; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); e.ip2 &= ip_set_netmask(e.cidr + 1); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_ipportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; u32 ip2_from = 0, ip2_to = 0, ip2, i = 0; bool with_ports = false; u8 cidr; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) 
return ret; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); if (ret) return ret; if (tb[IPSET_ATTR_CIDR2]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; e.cidr = cidr - 1; } e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_IP_TO] || with_ports || tb[IPSET_ATTR_IP2_TO])) { e.ip = htonl(ip); e.ip2 = htonl(ip2_from & ip_set_hostmask(e.cidr + 1)); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip; if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) swap(ip, ip_to); } else if (tb[IPSET_ATTR_CIDR]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } port_to = port = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); } ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); if (ret) return ret; if (ip2_from > ip2_to) swap(ip2_from, ip2_to); if (ip2_from + UINT_MAX == ip2_to) return -IPSET_ERR_HASH_RANGE; } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1); } if (retried) { ip = ntohl(h->next.ip); p = ntohs(h->next.port); ip2 = ntohl(h->next.ip2); } else { p = port; ip2 = ip2_from; } for (; ip <= ip_to; ip++) { e.ip = htonl(ip); for (; p <= port_to; p++) { e.port = htons(p); do { i++; e.ip2 = htonl(ip2); ip2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr); e.cidr = cidr - 1; if (i > IPSET_MAX_RANGE) { hash_ipportnet4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } while (ip2++ < ip2_to); ip2 = ip2_from; } p = port; } return ret; } /* IPv6 variant */ struct hash_ipportnet6_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 cidr:7; u8 nomatch:1; u8 proto; }; /* Common functions */ static bool hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, const struct hash_ipportnet6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && ip1->cidr == ip2->cidr && ip1->port == ip2->port && ip1->proto == ip2->proto; } static int hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem) { return elem->nomatch ? 
-ENOTEMPTY : 1; } static void hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static void hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip2, cidr); elem->cidr = cidr - 1; } static bool hash_ipportnet6_data_list(struct sk_buff *skb, const struct hash_ipportnet6_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportnet6_data_next(struct hash_ipportnet6_elem *next, const struct hash_ipportnet6_elem *d) { next->port = d->port; } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ipportnet6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ipportnet6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet6_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (adt == IPSET_TEST) e.cidr = HOST_MASK - 1; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); ip6_netmask(&e.ip2, e.cidr + 1); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ipportnet6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; u8 cidr; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; if (tb[IPSET_ATTR_CIDR2]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; e.cidr = cidr - 1; } ip6_netmask(&e.ip2, e.cidr + 1); e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else 
{ return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { e.port = htons(port); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } static struct ip_set_type hash_ipportnet_type __read_mostly = { .name = "hash:ip,port,net", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 | IPSET_TYPE_NOMATCH, .dimension = IPSET_DIM_THREE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipportnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipportnet_init(void) { return ip_set_type_register(&hash_ipportnet_type); } static void __exit hash_ipportnet_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ipportnet_type); } module_init(hash_ipportnet_init); module_exit(hash_ipportnet_fini);
/*
 * hugetlbpage-backed filesystem. Based on ramfs.
 *
 * Nadia Yvette Chambers, 2002
 *
 * Copyright (C) 2002 Linus Torvalds.
 * License: GPL
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/ctype.h>
#include <linux/backing-dev.h>
#include <linux/hugetlb.h>
#include <linux/pagevec.h>
#include <linux/fs_parser.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/dnotify.h>
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/magic.h>
#include <linux/migrate.h>
#include <linux/uio.h>
#include <linux/uaccess.h>
#include <linux/sched/mm.h>

static const struct address_space_operations hugetlbfs_aops;
const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;

enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };

struct hugetlbfs_fs_context {
	struct hstate		*hstate;
	unsigned long long	max_size_opt;
	unsigned long long	min_size_opt;
	long			max_hpages;
	long			nr_inodes;
	long			min_hpages;
	enum hugetlbfs_size_type max_val_type;
	enum hugetlbfs_size_type min_val_type;
	kuid_t			uid;
	kgid_t			gid;
	umode_t			mode;
};

int sysctl_hugetlb_shm_group;

enum hugetlb_param {
	Opt_gid,
	Opt_min_size,
	Opt_mode,
	Opt_nr_inodes,
	Opt_pagesize,
	Opt_size,
	Opt_uid,
};

static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
	fsparam_u32   ("gid",		Opt_gid),
	fsparam_string("min_size",	Opt_min_size),
	fsparam_u32oct("mode",		Opt_mode),
	fsparam_string("nr_inodes",	Opt_nr_inodes),
	fsparam_string("pagesize",	Opt_pagesize),
	fsparam_string("size",		Opt_size),
	fsparam_u32   ("uid",		Opt_uid),
	{}
};

/*
 * Mask used when checking the page offset value passed in via system
 * calls. This value will be converted to a loff_t which is signed.
 * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the
 * value. The extra bit (- 1 in the shift value) is to take the sign
 * bit into account.
 */
#define PGOFF_LOFFT_MAX \
	(((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1)))

static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(file);
	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
	loff_t len, vma_len;
	int ret;
	struct hstate *h = hstate_file(file);
	vm_flags_t vm_flags;

	/*
	 * vma address alignment (but not the pgoff alignment) has
	 * already been checked by prepare_hugepage_range.
If you add * any error returns here, do so after setting VM_HUGETLB, so * is_vm_hugetlb_page tests below unmap_region go the right * way when do_mmap unwinds (may be important on powerpc * and ia64). */ vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); vma->vm_ops = &hugetlb_vm_ops; ret = seal_check_write(info->seals, vma); if (ret) return ret; /* * page based offset in vm_pgoff could be sufficiently large to * overflow a loff_t when converted to byte offset. This can * only happen on architectures where sizeof(loff_t) == * sizeof(unsigned long). So, only check in those instances. */ if (sizeof(unsigned long) == sizeof(loff_t)) { if (vma->vm_pgoff & PGOFF_LOFFT_MAX) return -EINVAL; } /* must be huge page aligned */ if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; vma_len = (loff_t)(vma->vm_end - vma->vm_start); len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* check for overflow */ if (len < vma_len) return -EINVAL; inode_lock(inode); file_accessed(file); ret = -ENOMEM; vm_flags = vma->vm_flags; /* * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip * reserving here. Note: only for SHM hugetlbfs file, the inode * flag S_PRIVATE is set. */ if (inode->i_flags & S_PRIVATE) vm_flags |= VM_NORESERVE; if (!hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), len >> huge_page_shift(h), vma, vm_flags)) goto out; ret = 0; if (vma->vm_flags & VM_WRITE && inode->i_size < len) i_size_write(inode, len); out: inode_unlock(inode); return ret; } /* * Called under mmap_write_lock(mm). */ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; info.flags = 0; info.length = len; info.low_limit = current->mm->mmap_base; info.high_limit = arch_get_mmap_end(addr, len, flags); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); } static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. 
*/ if (unlikely(offset_in_page(addr))) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = current->mm->mmap_base; info.high_limit = arch_get_mmap_end(addr, len, flags); addr = vm_unmapped_area(&info); } return addr; } unsigned long generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); if (len & ~huge_page_mask(h)) return -EINVAL; if (len > TASK_SIZE) return -ENOMEM; if (flags & MAP_FIXED) { if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (mmap_end - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } /* * Use mm->get_unmapped_area value as a hint to use topdown routine. * If architectures have special needs, they should define their own * version of hugetlb_get_unmapped_area. */ if (mm->get_unmapped_area == arch_get_unmapped_area_topdown) return hugetlb_get_unmapped_area_topdown(file, addr, len, pgoff, flags); return hugetlb_get_unmapped_area_bottomup(file, addr, len, pgoff, flags); } #ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA static unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); } #endif /* * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset. * Returns the maximum number of bytes one can read without touching the 1st raw * HWPOISON subpage. * * The implementation borrows the iteration logic from copy_page_to_iter*. */ static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes) { size_t n = 0; size_t res = 0; /* First subpage to start the loop. */ page = nth_page(page, offset / PAGE_SIZE); offset %= PAGE_SIZE; while (1) { if (is_raw_hwpoison_page_in_hugepage(page)) break; /* Safe to read n bytes without touching HWPOISON subpage. */ n = min(bytes, (size_t)PAGE_SIZE - offset); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page = nth_page(page, 1); offset = 0; } } return res; } /* * Support for read() - Find the page attached to f_mapping and copy out the * data. This provides functionality similar to filemap_read(). */ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct hstate *h = hstate_file(file); struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; unsigned long index = iocb->ki_pos >> huge_page_shift(h); unsigned long offset = iocb->ki_pos & ~huge_page_mask(h); unsigned long end_index; loff_t isize; ssize_t retval = 0; while (iov_iter_count(to)) { struct folio *folio; size_t nr, copied, want; /* nr is the maximum number of bytes to copy from this page */ nr = huge_page_size(h); isize = i_size_read(inode); if (!isize) break; end_index = (isize - 1) >> huge_page_shift(h); if (index > end_index) break; if (index == end_index) { nr = ((isize - 1) & ~huge_page_mask(h)) + 1; if (nr <= offset) break; } nr = nr - offset; /* Find the folio */ folio = filemap_lock_hugetlb_folio(h, mapping, index); if (IS_ERR(folio)) { /* * We have a HOLE, zero out the user-buffer for the * length of the hole or request. 
*/ copied = iov_iter_zero(nr, to); } else { folio_unlock(folio); if (!folio_test_hwpoison(folio)) want = nr; else { /* * Adjust how many bytes safe to read without * touching the 1st raw HWPOISON subpage after * offset. */ want = adjust_range_hwpoison(&folio->page, offset, nr); if (want == 0) { folio_put(folio); retval = -EIO; break; } } /* * We have the folio, copy it to user space buffer. */ copied = copy_folio_to_iter(folio, offset, want, to); folio_put(folio); } offset += copied; retval += copied; if (copied != nr && iov_iter_count(to)) { if (!retval) retval = -EFAULT; break; } index += offset >> huge_page_shift(h); offset &= ~huge_page_mask(h); } iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset; return retval; } static int hugetlbfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { return -EINVAL; } static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { BUG(); return -EINVAL; } static void hugetlb_delete_from_page_cache(struct folio *folio) { folio_clear_dirty(folio); folio_clear_uptodate(folio); filemap_remove_folio(folio); } /* * Called with i_mmap_rwsem held for inode based vma maps. This makes * sure vma (and vm_mm) will not go away. We also hold the hugetlb fault * mutex for the page in the mapping. So, we can not race with page being * faulted into the vma. */ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { pte_t *ptep, pte; ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma))); if (!ptep) return false; pte = huge_ptep_get(ptep); if (huge_pte_none(pte) || !pte_present(pte)) return false; if (pte_page(pte) == page) return true; return false; } /* * Can vma_offset_start/vma_offset_end overflow on 32-bit arches? * No, because the interval tree returns us only those vmas * which overlap the truncated area starting at pgoff, * and no vma on a 32-bit arch can span beyond the 4GB. */ static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start) { unsigned long offset = 0; if (vma->vm_pgoff < start) offset = (start - vma->vm_pgoff) << PAGE_SHIFT; return vma->vm_start + offset; } static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end) { unsigned long t_end; if (!end) return vma->vm_end; t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start; if (t_end > vma->vm_end) t_end = vma->vm_end; return t_end; } /* * Called with hugetlb fault mutex held. Therefore, no more mappings to * this folio can be created while executing the routine. */ static void hugetlb_unmap_file_folio(struct hstate *h, struct address_space *mapping, struct folio *folio, pgoff_t index) { struct rb_root_cached *root = &mapping->i_mmap; struct hugetlb_vma_lock *vma_lock; struct page *page = &folio->page; struct vm_area_struct *vma; unsigned long v_start; unsigned long v_end; pgoff_t start, end; start = index * pages_per_huge_page(h); end = (index + 1) * pages_per_huge_page(h); i_mmap_lock_write(mapping); retry: vma_lock = NULL; vma_interval_tree_foreach(vma, root, start, end - 1) { v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); if (!hugetlb_vma_maps_page(vma, v_start, page)) continue; if (!hugetlb_vma_trylock_write(vma)) { vma_lock = vma->vm_private_data; /* * If we can not get vma lock, we need to drop * immap_sema and take locks in order. 
First, * take a ref on the vma_lock structure so that * we can be guaranteed it will not go away when * dropping immap_sema. */ kref_get(&vma_lock->refs); break; } unmap_hugepage_range(vma, v_start, v_end, NULL, ZAP_FLAG_DROP_MARKER); hugetlb_vma_unlock_write(vma); } i_mmap_unlock_write(mapping); if (vma_lock) { /* * Wait on vma_lock. We know it is still valid as we have * a reference. We must 'open code' vma locking as we do * not know if vma_lock is still attached to vma. */ down_write(&vma_lock->rw_sema); i_mmap_lock_write(mapping); vma = vma_lock->vma; if (!vma) { /* * If lock is no longer attached to vma, then just * unlock, drop our reference and retry looking for * other vmas. */ up_write(&vma_lock->rw_sema); kref_put(&vma_lock->refs, hugetlb_vma_lock_release); goto retry; } /* * vma_lock is still attached to vma. Check to see if vma * still maps page and if so, unmap. */ v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); if (hugetlb_vma_maps_page(vma, v_start, page)) unmap_hugepage_range(vma, v_start, v_end, NULL, ZAP_FLAG_DROP_MARKER); kref_put(&vma_lock->refs, hugetlb_vma_lock_release); hugetlb_vma_unlock_write(vma); goto retry; } } static void hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, zap_flags_t zap_flags) { struct vm_area_struct *vma; /* * end == 0 indicates that the entire range after start should be * unmapped. Note, end is exclusive, whereas the interval tree takes * an inclusive "last". */ vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) { unsigned long v_start; unsigned long v_end; if (!hugetlb_vma_trylock_write(vma)) continue; v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); unmap_hugepage_range(vma, v_start, v_end, NULL, zap_flags); /* * Note that vma lock only exists for shared/non-private * vmas. Therefore, lock is not held when calling * unmap_hugepage_range for private vmas. */ hugetlb_vma_unlock_write(vma); } } /* * Called with hugetlb fault mutex held. * Returns true if page was actually removed, false otherwise. */ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, struct address_space *mapping, struct folio *folio, pgoff_t index, bool truncate_op) { bool ret = false; /* * If folio is mapped, it was faulted in after being * unmapped in caller. Unmap (again) while holding * the fault mutex. The mutex will prevent faults * until we finish removing the folio. */ if (unlikely(folio_mapped(folio))) hugetlb_unmap_file_folio(h, mapping, folio, index); folio_lock(folio); /* * We must remove the folio from page cache before removing * the region/ reserve map (hugetlb_unreserve_pages). In * rare out of memory conditions, removal of the region/reserve * map could fail. Correspondingly, the subpool and global * reserve usage count can need to be adjusted. */ VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio); hugetlb_delete_from_page_cache(folio); ret = true; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1))) hugetlb_fix_reserve_counts(inode); } folio_unlock(folio); return ret; } /* * remove_inode_hugepages handles two distinct cases: truncation and hole * punch. There are subtle differences in operation for each case. * * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve * maps and global counts. Page faults can race with truncation. 
* During faults, hugetlb_no_page() checks i_size before page allocation, * and again after obtaining page table lock. It will 'back out' * allocations in the truncated range. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserve map * deleted. The region/reserve map for ranges without associated * pages are not modified. Page faults can race with hole punch. * This is indicated if we find a mapped page. * Note: If the passed end of range value is beyond the end of file, but * not LLONG_MAX this routine still performs a hole punch operation. */ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, loff_t lend) { struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; const pgoff_t end = lend >> PAGE_SHIFT; struct folio_batch fbatch; pgoff_t next, index; int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); folio_batch_init(&fbatch); next = lstart >> PAGE_SHIFT; while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); ++i) { struct folio *folio = fbatch.folios[i]; u32 hash = 0; index = folio->index >> huge_page_order(h); hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* * Remove folio that was part of folio_batch. */ if (remove_inode_single_folio(h, inode, mapping, folio, index, truncate_op)) freed++; mutex_unlock(&hugetlb_fault_mutex_table[hash]); } folio_batch_release(&fbatch); cond_resched(); } if (truncate_op) (void)hugetlb_unreserve_pages(inode, lstart >> huge_page_shift(h), LONG_MAX, freed); } static void hugetlbfs_evict_inode(struct inode *inode) { struct resv_map *resv_map; remove_inode_hugepages(inode, 0, LLONG_MAX); /* * Get the resv_map from the address space embedded in the inode. * This is the address space which points to any resv_map allocated * at inode creation time. If this is a device special inode, * i_mapping may not point to the original address space. 
*/ resv_map = (struct resv_map *)(&inode->i_data)->i_private_data; /* Only regular and link inodes have associated reserve maps */ if (resv_map) resv_map_release(&resv_map->refs); clear_inode(inode); } static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) { pgoff_t pgoff; struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); BUG_ON(offset & ~huge_page_mask(h)); pgoff = offset >> PAGE_SHIFT; i_size_write(inode, offset); i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0, ZAP_FLAG_DROP_MARKER); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, offset, LLONG_MAX); } static void hugetlbfs_zero_partial_page(struct hstate *h, struct address_space *mapping, loff_t start, loff_t end) { pgoff_t idx = start >> huge_page_shift(h); struct folio *folio; folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) return; start = start & ~huge_page_mask(h); end = end & ~huge_page_mask(h); if (!end) end = huge_page_size(h); folio_zero_segment(folio, (size_t)start, (size_t)end); folio_unlock(folio); folio_put(folio); } static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); loff_t hpage_size = huge_page_size(h); loff_t hole_start, hole_end; /* * hole_start and hole_end indicate the full pages within the hole. */ hole_start = round_up(offset, hpage_size); hole_end = round_down(offset + len, hpage_size); inode_lock(inode); /* protected by i_rwsem */ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { inode_unlock(inode); return -EPERM; } i_mmap_lock_write(mapping); /* If range starts before first full page, zero partial page. */ if (offset < hole_start) hugetlbfs_zero_partial_page(h, mapping, offset, min(offset + len, hole_start)); /* Unmap users of full pages in the hole. */ if (hole_end > hole_start) { if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, hole_start >> PAGE_SHIFT, hole_end >> PAGE_SHIFT, 0); } /* If range extends beyond last full page, zero partial page. */ if ((offset + len) > hole_end && (offset + len) > hole_start) hugetlbfs_zero_partial_page(h, mapping, hole_end, offset + len); i_mmap_unlock_write(mapping); /* Remove full pages from the file. */ if (hole_end > hole_start) remove_inode_hugepages(inode, hole_start, hole_end); inode_unlock(inode); return 0; } static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); struct vm_area_struct pseudo_vma; struct mm_struct *mm = current->mm; loff_t hpage_size = huge_page_size(h); unsigned long hpage_shift = huge_page_shift(h); pgoff_t start, index, end; int error; u32 hash; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) return hugetlbfs_punch_hole(inode, offset, len); /* * Default preallocate case. * For this range, start is rounded down and end is rounded up * as well as being converted to page offsets. 
*/ start = offset >> hpage_shift; end = (offset + len + hpage_size - 1) >> hpage_shift; inode_lock(inode); /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ error = inode_newsize_ok(inode, offset + len); if (error) goto out; if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { error = -EPERM; goto out; } /* * Initialize a pseudo vma as this is required by the huge page * allocation routines. */ vma_init(&pseudo_vma, mm); vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pseudo_vma.vm_file = file; for (index = start; index < end; index++) { /* * This is supposed to be the vaddr where the page is being * faulted in, but we have no vaddr here. */ struct folio *folio; unsigned long addr; cond_resched(); /* * fallocate(2) manpage permits EINTR; we may have been * interrupted because we are using up too much memory. */ if (signal_pending(current)) { error = -EINTR; break; } /* addr is the offset within the file (zero based) */ addr = index * hpage_size; /* mutex taken here, fault path and hole punch */ hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ folio = filemap_get_folio(mapping, index << huge_page_order(h)); if (!IS_ERR(folio)) { folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); continue; } /* * Allocate folio without setting the avoid_reserve argument. * There certainly are no reserves associated with the * pseudo_vma. However, there could be shared mappings with * reserves for the file at the inode level. If we fallocate * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); error = PTR_ERR(folio); goto out; } clear_huge_page(&folio->page, addr, pages_per_huge_page(h)); __folio_mark_uptodate(folio); error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { restore_reserve_on_error(h, &pseudo_vma, addr, folio); folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; } mutex_unlock(&hugetlb_fault_mutex_table[hash]); folio_set_hugetlb_migratable(folio); /* * folio_unlock because locked by hugetlb_add_to_page_cache() * folio_put() due to reference from alloc_hugetlb_folio() */ folio_unlock(folio); folio_put(folio); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); inode_set_ctime_current(inode); out: inode_unlock(inode); return error; } static int hugetlbfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct hstate *h = hstate_inode(inode); int error; unsigned int ia_valid = attr->ia_valid; struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; if (ia_valid & ATTR_SIZE) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; if (newsize & ~huge_page_mask(h)) return -EINVAL; /* protected by i_rwsem */ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || (newsize > oldsize && (info->seals & F_SEAL_GROW))) return -EPERM; hugetlb_vmtruncate(inode, newsize); } setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } static struct inode *hugetlbfs_get_root(struct super_block *sb, struct hugetlbfs_fs_context *ctx) { struct inode *inode; inode = new_inode(sb); if (inode) { inode->i_ino = 
get_next_ino(); inode->i_mode = S_IFDIR | ctx->mode; inode->i_uid = ctx->uid; inode->i_gid = ctx->gid; simple_inode_init_ts(inode); inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); lockdep_annotate_inode_mutex_key(inode); } return inode; } /* * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never * be taken from reclaim -- unlike regular filesystems. This needs an * annotation because huge_pmd_share() does an allocation under hugetlb's * i_mmap_rwsem. */ static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev) { struct inode *inode; struct resv_map *resv_map = NULL; /* * Reserve maps are only needed for inodes that can have associated * page allocations. */ if (S_ISREG(mode) || S_ISLNK(mode)) { resv_map = resv_map_alloc(); if (!resv_map) return NULL; } inode = new_inode(sb); if (inode) { struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); inode->i_ino = get_next_ino(); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; simple_inode_init_ts(inode); inode->i_mapping->i_private_data = resv_map; info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); break; case S_IFREG: inode->i_op = &hugetlbfs_inode_operations; inode->i_fop = &hugetlbfs_file_operations; break; case S_IFDIR: inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); break; } lockdep_annotate_inode_mutex_key(inode); } else { if (resv_map) kref_put(&resv_map->refs, resv_map_release); } return inode; } /* * File creation. Allocate an inode, and we're done.. 
*/ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); if (!inode) return -ENOSPC; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); d_instantiate(dentry, inode); dget(dentry);/* Extra count - pin the dentry in core */ return 0; } static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int retval = hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); return retval; } static int hugetlbfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); } static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0); if (!inode) return -ENOSPC; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); d_tmpfile(file, inode); return finish_open_simple(file, 0); } static int hugetlbfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct inode *inode; int error = -ENOSPC; inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); if (inode) { int l = strlen(symname)+1; error = page_symlink(inode, symname, l); if (!error) { d_instantiate(dentry, inode); dget(dentry); } else iput(inode); } inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); return error; } #ifdef CONFIG_MIGRATION static int hugetlbfs_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { int rc; rc = migrate_huge_page_move_mapping(mapping, dst, src); if (rc != MIGRATEPAGE_SUCCESS) return rc; if (hugetlb_folio_subpool(src)) { hugetlb_set_folio_subpool(dst, hugetlb_folio_subpool(src)); hugetlb_set_folio_subpool(src, NULL); } if (mode != MIGRATE_SYNC_NO_COPY) folio_migrate_copy(dst, src); else folio_migrate_flags(dst, src); return MIGRATEPAGE_SUCCESS; } #else #define hugetlbfs_migrate_folio NULL #endif static int hugetlbfs_error_remove_folio(struct address_space *mapping, struct folio *folio) { return 0; } /* * Display the mount options in /proc/mounts. 
*/ static int hugetlbfs_show_options(struct seq_file *m, struct dentry *root) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(root->d_sb); struct hugepage_subpool *spool = sbinfo->spool; unsigned long hpage_size = huge_page_size(sbinfo->hstate); unsigned hpage_shift = huge_page_shift(sbinfo->hstate); char mod; if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, sbinfo->uid)); if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); if (sbinfo->mode != 0755) seq_printf(m, ",mode=%o", sbinfo->mode); if (sbinfo->max_inodes != -1) seq_printf(m, ",nr_inodes=%lu", sbinfo->max_inodes); hpage_size /= 1024; mod = 'K'; if (hpage_size >= 1024) { hpage_size /= 1024; mod = 'M'; } seq_printf(m, ",pagesize=%lu%c", hpage_size, mod); if (spool) { if (spool->max_hpages != -1) seq_printf(m, ",size=%llu", (unsigned long long)spool->max_hpages << hpage_shift); if (spool->min_hpages != -1) seq_printf(m, ",min_size=%llu", (unsigned long long)spool->min_hpages << hpage_shift); } return 0; } static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); struct hstate *h = hstate_inode(d_inode(dentry)); u64 id = huge_encode_dev(dentry->d_sb->s_dev); buf->f_fsid = u64_to_fsid(id); buf->f_type = HUGETLBFS_MAGIC; buf->f_bsize = huge_page_size(h); if (sbinfo) { spin_lock(&sbinfo->stat_lock); /* If no limits set, just report 0 or -1 for max/free/used * blocks, like simple_statfs() */ if (sbinfo->spool) { long free_pages; spin_lock_irq(&sbinfo->spool->lock); buf->f_blocks = sbinfo->spool->max_hpages; free_pages = sbinfo->spool->max_hpages - sbinfo->spool->used_hpages; buf->f_bavail = buf->f_bfree = free_pages; spin_unlock_irq(&sbinfo->spool->lock); buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_inodes; } spin_unlock(&sbinfo->stat_lock); } buf->f_namelen = NAME_MAX; return 0; } static void hugetlbfs_put_super(struct super_block *sb) { struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb); if (sbi) { sb->s_fs_info = NULL; if (sbi->spool) hugepage_put_subpool(sbi->spool); kfree(sbi); } } static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) { if (sbinfo->free_inodes >= 0) { spin_lock(&sbinfo->stat_lock); if (unlikely(!sbinfo->free_inodes)) { spin_unlock(&sbinfo->stat_lock); return 0; } sbinfo->free_inodes--; spin_unlock(&sbinfo->stat_lock); } return 1; } static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) { if (sbinfo->free_inodes >= 0) { spin_lock(&sbinfo->stat_lock); sbinfo->free_inodes++; spin_unlock(&sbinfo->stat_lock); } } static struct kmem_cache *hugetlbfs_inode_cachep; static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); struct hugetlbfs_inode_info *p; if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) return NULL; p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL); if (unlikely(!p)) { hugetlbfs_inc_free_inodes(sbinfo); return NULL; } return &p->vfs_inode; } static void hugetlbfs_free_inode(struct inode *inode) { kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); } static const struct address_space_operations hugetlbfs_aops = { .write_begin = hugetlbfs_write_begin, .write_end = hugetlbfs_write_end, .dirty_folio = noop_dirty_folio, .migrate_folio = hugetlbfs_migrate_folio, 
.error_remove_folio = hugetlbfs_error_remove_folio, }; static void init_once(void *foo) { struct hugetlbfs_inode_info *ei = foo; inode_init_once(&ei->vfs_inode); } const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, .fallocate = hugetlbfs_fallocate, }; static const struct inode_operations hugetlbfs_dir_inode_operations = { .create = hugetlbfs_create, .lookup = simple_lookup, .link = simple_link, .unlink = simple_unlink, .symlink = hugetlbfs_symlink, .mkdir = hugetlbfs_mkdir, .rmdir = simple_rmdir, .mknod = hugetlbfs_mknod, .rename = simple_rename, .setattr = hugetlbfs_setattr, .tmpfile = hugetlbfs_tmpfile, }; static const struct inode_operations hugetlbfs_inode_operations = { .setattr = hugetlbfs_setattr, }; static const struct super_operations hugetlbfs_ops = { .alloc_inode = hugetlbfs_alloc_inode, .free_inode = hugetlbfs_free_inode, .destroy_inode = hugetlbfs_destroy_inode, .evict_inode = hugetlbfs_evict_inode, .statfs = hugetlbfs_statfs, .put_super = hugetlbfs_put_super, .show_options = hugetlbfs_show_options, }; /* * Convert size option passed from command line to number of huge pages * in the pool specified by hstate. Size option could be in bytes * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT). */ static long hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt, enum hugetlbfs_size_type val_type) { if (val_type == NO_SIZE) return -1; if (val_type == SIZE_PERCENT) { size_opt <<= huge_page_shift(h); size_opt *= h->max_huge_pages; do_div(size_opt, 100); } size_opt >>= huge_page_shift(h); return size_opt; } /* * Parse one mount parameter. */ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hugetlbfs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; struct hstate *h; char *rest; unsigned long ps; int opt; opt = fs_parse(fc, hugetlb_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_uid: ctx->uid = make_kuid(current_user_ns(), result.uint_32); if (!uid_valid(ctx->uid)) goto bad_val; return 0; case Opt_gid: ctx->gid = make_kgid(current_user_ns(), result.uint_32); if (!gid_valid(ctx->gid)) goto bad_val; return 0; case Opt_mode: ctx->mode = result.uint_32 & 01777U; return 0; case Opt_size: /* memparse() will accept a K/M/G without a digit */ if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->max_size_opt = memparse(param->string, &rest); ctx->max_val_type = SIZE_STD; if (*rest == '%') ctx->max_val_type = SIZE_PERCENT; return 0; case Opt_nr_inodes: /* memparse() will accept a K/M/G without a digit */ if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->nr_inodes = memparse(param->string, &rest); return 0; case Opt_pagesize: ps = memparse(param->string, &rest); h = size_to_hstate(ps); if (!h) { pr_err("Unsupported page size %lu MB\n", ps / SZ_1M); return -EINVAL; } ctx->hstate = h; return 0; case Opt_min_size: /* memparse() will accept a K/M/G without a digit */ if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->min_size_opt = memparse(param->string, &rest); ctx->min_val_type = SIZE_STD; if (*rest == '%') ctx->min_val_type = SIZE_PERCENT; return 0; default: return -EINVAL; } bad_val: return invalfc(fc, "Bad value '%s' for mount option '%s'\n", param->string, param->key); } /* * Validate the parsed options. 
*/ static int hugetlbfs_validate(struct fs_context *fc) { struct hugetlbfs_fs_context *ctx = fc->fs_private; /* * Use huge page pool size (in hstate) to convert the size * options to number of huge pages. If NO_SIZE, -1 is returned. */ ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate, ctx->max_size_opt, ctx->max_val_type); ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate, ctx->min_size_opt, ctx->min_val_type); /* * If max_size was specified, then min_size must be smaller */ if (ctx->max_val_type > NO_SIZE && ctx->min_hpages > ctx->max_hpages) { pr_err("Minimum size can not be greater than maximum size\n"); return -EINVAL; } return 0; } static int hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct hugetlbfs_fs_context *ctx = fc->fs_private; struct hugetlbfs_sb_info *sbinfo; sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); if (!sbinfo) return -ENOMEM; sb->s_fs_info = sbinfo; spin_lock_init(&sbinfo->stat_lock); sbinfo->hstate = ctx->hstate; sbinfo->max_inodes = ctx->nr_inodes; sbinfo->free_inodes = ctx->nr_inodes; sbinfo->spool = NULL; sbinfo->uid = ctx->uid; sbinfo->gid = ctx->gid; sbinfo->mode = ctx->mode; /* * Allocate and initialize subpool if maximum or minimum size is * specified. Any needed reservations (for minimum size) are taken * when the subpool is created. */ if (ctx->max_hpages != -1 || ctx->min_hpages != -1) { sbinfo->spool = hugepage_new_subpool(ctx->hstate, ctx->max_hpages, ctx->min_hpages); if (!sbinfo->spool) goto out_free; } sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = huge_page_size(ctx->hstate); sb->s_blocksize_bits = huge_page_shift(ctx->hstate); sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; sb->s_time_gran = 1; /* * Due to the special and limited functionality of hugetlbfs, it does * not work well as a stacking filesystem. 
*/ sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx)); if (!sb->s_root) goto out_free; return 0; out_free: kfree(sbinfo->spool); kfree(sbinfo); return -ENOMEM; } static int hugetlbfs_get_tree(struct fs_context *fc) { int err = hugetlbfs_validate(fc); if (err) return err; return get_tree_nodev(fc, hugetlbfs_fill_super); } static void hugetlbfs_fs_context_free(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations hugetlbfs_fs_context_ops = { .free = hugetlbfs_fs_context_free, .parse_param = hugetlbfs_parse_param, .get_tree = hugetlbfs_get_tree, }; static int hugetlbfs_init_fs_context(struct fs_context *fc) { struct hugetlbfs_fs_context *ctx; ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->max_hpages = -1; /* No limit on size by default */ ctx->nr_inodes = -1; /* No limit on number of inodes by default */ ctx->uid = current_fsuid(); ctx->gid = current_fsgid(); ctx->mode = 0755; ctx->hstate = &default_hstate; ctx->min_hpages = -1; /* No default minimum size */ ctx->max_val_type = NO_SIZE; ctx->min_val_type = NO_SIZE; fc->fs_private = ctx; fc->ops = &hugetlbfs_fs_context_ops; return 0; } static struct file_system_type hugetlbfs_fs_type = { .name = "hugetlbfs", .init_fs_context = hugetlbfs_init_fs_context, .parameters = hugetlb_fs_parameters, .kill_sb = kill_litter_super, }; static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; static int can_do_hugetlb_shm(void) { kgid_t shm_group; shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group); return capable(CAP_IPC_LOCK) || in_group_p(shm_group); } static int get_hstate_idx(int page_size_log) { struct hstate *h = hstate_sizelog(page_size_log); if (!h) return -1; return hstate_index(h); } /* * Note that size should be aligned to proper hugepage size in caller side, * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. 
*/ struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, int creat_flags, int page_size_log) { struct inode *inode; struct vfsmount *mnt; int hstate_idx; struct file *file; hstate_idx = get_hstate_idx(page_size_log); if (hstate_idx < 0) return ERR_PTR(-ENODEV); mnt = hugetlbfs_vfsmount[hstate_idx]; if (!mnt) return ERR_PTR(-ENOENT); if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { struct ucounts *ucounts = current_ucounts(); if (user_shm_lock(size, ucounts)) { pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n", current->comm, current->pid); user_shm_unlock(size, ucounts); } return ERR_PTR(-EPERM); } file = ERR_PTR(-ENOSPC); inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0); if (!inode) goto out; if (creat_flags == HUGETLB_SHMFS_INODE) inode->i_flags |= S_PRIVATE; inode->i_size = size; clear_nlink(inode); if (!hugetlb_reserve_pages(inode, 0, size >> huge_page_shift(hstate_inode(inode)), NULL, acctflag)) file = ERR_PTR(-ENOMEM); else file = alloc_file_pseudo(inode, mnt, name, O_RDWR, &hugetlbfs_file_operations); if (!IS_ERR(file)) return file; iput(inode); out: return file; } static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) { struct fs_context *fc; struct vfsmount *mnt; fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT); if (IS_ERR(fc)) { mnt = ERR_CAST(fc); } else { struct hugetlbfs_fs_context *ctx = fc->fs_private; ctx->hstate = h; mnt = fc_mount(fc); put_fs_context(fc); } if (IS_ERR(mnt)) pr_err("Cannot mount internal hugetlbfs for page size %luK", huge_page_size(h) / SZ_1K); return mnt; } static int __init init_hugetlbfs_fs(void) { struct vfsmount *mnt; struct hstate *h; int error; int i; if (!hugepages_supported()) { pr_info("disabling because there are no supported hugepage sizes\n"); return -ENOTSUPP; } error = -ENOMEM; hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", sizeof(struct hugetlbfs_inode_info), 0, SLAB_ACCOUNT, init_once); if (hugetlbfs_inode_cachep == NULL) goto out; error = register_filesystem(&hugetlbfs_fs_type); if (error) goto out_free; /* default hstate mount is required */ mnt = mount_one_hugetlbfs(&default_hstate); if (IS_ERR(mnt)) { error = PTR_ERR(mnt); goto out_unreg; } hugetlbfs_vfsmount[default_hstate_idx] = mnt; /* other hstates are optional */ i = 0; for_each_hstate(h) { if (i == default_hstate_idx) { i++; continue; } mnt = mount_one_hugetlbfs(h); if (IS_ERR(mnt)) hugetlbfs_vfsmount[i] = NULL; else hugetlbfs_vfsmount[i] = mnt; i++; } return 0; out_unreg: (void)unregister_filesystem(&hugetlbfs_fs_type); out_free: kmem_cache_destroy(hugetlbfs_inode_cachep); out: return error; } fs_initcall(init_hugetlbfs_fs)
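/*
 * The fallocate and truncate handlers above are easiest to see from the
 * caller's side. What follows is a small userspace sketch, not part of the
 * kernel sources: it assumes a hugetlbfs mount at /dev/hugepages backed by
 * 2 MiB huge pages (both the path and the page size are assumptions). The
 * plain fallocate() call exercises hugetlbfs_fallocate(), the PUNCH_HOLE
 * call exercises hugetlbfs_punch_hole(), and ftruncate() reaches
 * hugetlb_vmtruncate() via hugetlbfs_setattr().
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumed huge page size */

int main(void)
{
	const char *path = "/dev/hugepages/demo";	/* hypothetical file */
	int fd = open(path, O_CREAT | O_RDWR, 0600);

	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}

	/* Preallocate four huge pages (hugetlbfs_fallocate()). */
	if (fallocate(fd, 0, 0, 4 * HPAGE_SIZE))
		perror("fallocate(preallocate)");

	/* Punch out the second huge page (hugetlbfs_punch_hole()). */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      HPAGE_SIZE, HPAGE_SIZE))
		perror("fallocate(punch hole)");

	/* Shrink to one huge page; the new size must stay huge-page aligned. */
	if (ftruncate(fd, HPAGE_SIZE))
		perror("ftruncate");

	close(fd);
	unlink(path);
	return EXIT_SUCCESS;
}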
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * DSA tagging protocol handling
 *
 * Copyright (c) 2008-2009 Marvell Semiconductor
 * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org>
 * Copyright (c) 2016 Andrew Lunn <andrew@lunn.ch>
 */

#include <linux/netdevice.h>
#include <linux/ptp_classify.h>
#include <linux/skbuff.h>
#include <net/dsa.h>
#include <net/dst_metadata.h>

#include "tag.h"
#include "user.h"

static LIST_HEAD(dsa_tag_drivers_list);
static DEFINE_MUTEX(dsa_tag_drivers_lock);

/* Determine if we should defer delivery of skb until we have a rx timestamp.
 *
 * Called from dsa_switch_rcv. For now, this will only work if tagging is
 * enabled on the switch. Normally the MAC driver would retrieve the hardware
 * timestamp when it reads the packet out of the hardware. However in a DSA
 * switch, the DSA driver owning the interface to which the packet is
 * delivered is never notified unless we do so here.
 */
static bool dsa_skb_defer_rx_timestamp(struct dsa_user_priv *p,
				       struct sk_buff *skb)
{
	struct dsa_switch *ds = p->dp->ds;
	unsigned int type;

	if (!ds->ops->port_rxtstamp)
		return false;

	if (skb_headroom(skb) < ETH_HLEN)
		return false;

	__skb_push(skb, ETH_HLEN);

	type = ptp_classify_raw(skb);

	__skb_pull(skb, ETH_HLEN);

	if (type == PTP_CLASS_NONE)
		return false;

	return ds->ops->port_rxtstamp(ds, p->dp->index, skb, type);
}

static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
			  struct packet_type *pt, struct net_device *unused)
{
	struct metadata_dst *md_dst = skb_metadata_dst(skb);
	struct dsa_port *cpu_dp = dev->dsa_ptr;
	struct sk_buff *nskb = NULL;
	struct dsa_user_priv *p;

	if (unlikely(!cpu_dp)) {
		kfree_skb(skb);
		return 0;
	}

	skb = skb_unshare(skb, GFP_ATOMIC);
	if (!skb)
		return 0;

	if (md_dst && md_dst->type == METADATA_HW_PORT_MUX) {
		unsigned int port = md_dst->u.port_info.port_id;

		skb_dst_drop(skb);
		if (!skb_has_extensions(skb))
			skb->slow_gro = 0;

		skb->dev = dsa_conduit_find_user(dev, 0, port);
		if (likely(skb->dev)) {
			dsa_default_offload_fwd_mark(skb);
			nskb = skb;
		}
	} else {
		nskb = cpu_dp->rcv(skb, dev);
	}

	if (!nskb) {
		kfree_skb(skb);
		return 0;
	}

	skb = nskb;
	skb_push(skb, ETH_HLEN);
	skb->pkt_type = PACKET_HOST;
	skb->protocol = eth_type_trans(skb, skb->dev);

	if (unlikely(!dsa_user_dev_check(skb->dev))) {
		/* Packet is to be injected directly on an upper
		 * device, e.g. a team/bond, so skip all DSA-port
		 * specific actions.
*/ netif_rx(skb); return 0; } p = netdev_priv(skb->dev); if (unlikely(cpu_dp->ds->untag_bridge_pvid)) { nskb = dsa_untag_bridge_pvid(skb); if (!nskb) { kfree_skb(skb); return 0; } skb = nskb; } dev_sw_netstats_rx_add(skb->dev, skb->len + ETH_HLEN); if (dsa_skb_defer_rx_timestamp(p, skb)) return 0; gro_cells_receive(&p->gcells, skb); return 0; } struct packet_type dsa_pack_type __read_mostly = { .type = cpu_to_be16(ETH_P_XDSA), .func = dsa_switch_rcv, }; static void dsa_tag_driver_register(struct dsa_tag_driver *dsa_tag_driver, struct module *owner) { dsa_tag_driver->owner = owner; mutex_lock(&dsa_tag_drivers_lock); list_add_tail(&dsa_tag_driver->list, &dsa_tag_drivers_list); mutex_unlock(&dsa_tag_drivers_lock); } void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], unsigned int count, struct module *owner) { unsigned int i; for (i = 0; i < count; i++) dsa_tag_driver_register(dsa_tag_driver_array[i], owner); } static void dsa_tag_driver_unregister(struct dsa_tag_driver *dsa_tag_driver) { mutex_lock(&dsa_tag_drivers_lock); list_del(&dsa_tag_driver->list); mutex_unlock(&dsa_tag_drivers_lock); } EXPORT_SYMBOL_GPL(dsa_tag_drivers_register); void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], unsigned int count) { unsigned int i; for (i = 0; i < count; i++) dsa_tag_driver_unregister(dsa_tag_driver_array[i]); } EXPORT_SYMBOL_GPL(dsa_tag_drivers_unregister); const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops) { return ops->name; }; /* Function takes a reference on the module owning the tagger, * so dsa_tag_driver_put must be called afterwards. */ const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name) { const struct dsa_device_ops *ops = ERR_PTR(-ENOPROTOOPT); struct dsa_tag_driver *dsa_tag_driver; request_module("%s%s", DSA_TAG_DRIVER_ALIAS, name); mutex_lock(&dsa_tag_drivers_lock); list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { const struct dsa_device_ops *tmp = dsa_tag_driver->ops; if (strcmp(name, tmp->name)) continue; if (!try_module_get(dsa_tag_driver->owner)) break; ops = tmp; break; } mutex_unlock(&dsa_tag_drivers_lock); return ops; } const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol) { struct dsa_tag_driver *dsa_tag_driver; const struct dsa_device_ops *ops; bool found = false; request_module("%sid-%d", DSA_TAG_DRIVER_ALIAS, tag_protocol); mutex_lock(&dsa_tag_drivers_lock); list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { ops = dsa_tag_driver->ops; if (ops->proto == tag_protocol) { found = true; break; } } if (found) { if (!try_module_get(dsa_tag_driver->owner)) ops = ERR_PTR(-ENOPROTOOPT); } else { ops = ERR_PTR(-ENOPROTOOPT); } mutex_unlock(&dsa_tag_drivers_lock); return ops; } void dsa_tag_driver_put(const struct dsa_device_ops *ops) { struct dsa_tag_driver *dsa_tag_driver; mutex_lock(&dsa_tag_drivers_lock); list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { if (dsa_tag_driver->ops == ops) { module_put(dsa_tag_driver->owner); break; } } mutex_unlock(&dsa_tag_drivers_lock); }
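/*
 * A minimal sketch of how the lookup helpers above are meant to be paired;
 * the "ocelot" tagger name is only an example and the function itself is
 * illustrative, not part of the DSA core. dsa_tag_driver_get_by_name()
 * takes a reference on the module that owns the tagger (or returns
 * ERR_PTR(-ENOPROTOOPT)), so every successful lookup must eventually be
 * balanced by a dsa_tag_driver_put().
 */
static const struct dsa_device_ops *example_grab_tagger(void)
{
	const struct dsa_device_ops *ops;

	ops = dsa_tag_driver_get_by_name("ocelot");
	if (IS_ERR(ops))
		return ops;

	pr_info("using tagging protocol %s\n", dsa_tag_protocol_to_str(ops));

	/* The caller is responsible for dsa_tag_driver_put(ops) later on. */
	return ops;
}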
// SPDX-License-Identifier: GPL-2.0-only
/*
 * kvm asynchronous fault support
 *
 * Copyright 2010 Red Hat, Inc.
 *
 * Author:
 *      Gleb Natapov <gleb@redhat.com>
 */

#include <linux/kvm_host.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/mmu_context.h>
#include <linux/sched/mm.h>

#include "async_pf.h"
#include <trace/events/kvm.h>

static struct kmem_cache *async_pf_cache;

int kvm_async_pf_init(void)
{
	async_pf_cache = KMEM_CACHE(kvm_async_pf, 0);

	if (!async_pf_cache)
		return -ENOMEM;

	return 0;
}

void kvm_async_pf_deinit(void)
{
	kmem_cache_destroy(async_pf_cache);
	async_pf_cache = NULL;
}

void kvm_async_pf_vcpu_init(struct kvm_vcpu *vcpu)
{
	INIT_LIST_HEAD(&vcpu->async_pf.done);
	INIT_LIST_HEAD(&vcpu->async_pf.queue);
	spin_lock_init(&vcpu->async_pf.lock);
}

static void async_pf_execute(struct work_struct *work)
{
	struct kvm_async_pf *apf =
		container_of(work, struct kvm_async_pf, work);
	struct mm_struct *mm = apf->mm;
	struct kvm_vcpu *vcpu = apf->vcpu;
	unsigned long addr = apf->addr;
	gpa_t cr2_or_gpa = apf->cr2_or_gpa;
	int locked = 1;
	bool first;

	might_sleep();

	/*
	 * This work is run asynchronously to the task which owns
	 * mm and might be done in another context, so we must
	 * access remotely.
	 */
	mmap_read_lock(mm);
	get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, &locked);
	if (locked)
		mmap_read_unlock(mm);

	if (IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC))
		kvm_arch_async_page_present(vcpu, apf);

	spin_lock(&vcpu->async_pf.lock);
	first = list_empty(&vcpu->async_pf.done);
	list_add_tail(&apf->link, &vcpu->async_pf.done);
	apf->vcpu = NULL;
	spin_unlock(&vcpu->async_pf.lock);

	if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first)
		kvm_arch_async_page_present_queued(vcpu);

	/*
	 * apf may be freed by kvm_check_async_pf_completion() after
	 * this point
	 */

	trace_kvm_async_pf_completed(addr, cr2_or_gpa);

	__kvm_vcpu_wake_up(vcpu);

	mmput(mm);
	kvm_put_kvm(vcpu->kvm);
}

void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
{
	spin_lock(&vcpu->async_pf.lock);

	/* cancel outstanding work queue item */
	while (!list_empty(&vcpu->async_pf.queue)) {
		struct kvm_async_pf *work =
			list_first_entry(&vcpu->async_pf.queue,
					 typeof(*work), queue);
		list_del(&work->queue);

		/*
		 * We know it's present in vcpu->async_pf.done, do
		 * nothing here.
*/ if (!work->vcpu) continue; spin_unlock(&vcpu->async_pf.lock); #ifdef CONFIG_KVM_ASYNC_PF_SYNC flush_work(&work->work); #else if (cancel_work_sync(&work->work)) { mmput(work->mm); kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */ kmem_cache_free(async_pf_cache, work); } #endif spin_lock(&vcpu->async_pf.lock); } while (!list_empty(&vcpu->async_pf.done)) { struct kvm_async_pf *work = list_first_entry(&vcpu->async_pf.done, typeof(*work), link); list_del(&work->link); kmem_cache_free(async_pf_cache, work); } spin_unlock(&vcpu->async_pf.lock); vcpu->async_pf.queued = 0; } void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) { struct kvm_async_pf *work; while (!list_empty_careful(&vcpu->async_pf.done) && kvm_arch_can_dequeue_async_page_present(vcpu)) { spin_lock(&vcpu->async_pf.lock); work = list_first_entry(&vcpu->async_pf.done, typeof(*work), link); list_del(&work->link); spin_unlock(&vcpu->async_pf.lock); kvm_arch_async_page_ready(vcpu, work); if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC)) kvm_arch_async_page_present(vcpu, work); list_del(&work->queue); vcpu->async_pf.queued--; kmem_cache_free(async_pf_cache, work); } } /* * Try to schedule a job to handle page fault asynchronously. Returns 'true' on * success, 'false' on failure (page fault has to be handled synchronously). */ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, unsigned long hva, struct kvm_arch_async_pf *arch) { struct kvm_async_pf *work; if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU) return false; /* Arch specific code should not do async PF in this case */ if (unlikely(kvm_is_error_hva(hva))) return false; /* * do alloc nowait since if we are going to sleep anyway we * may as well sleep faulting in page */ work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT | __GFP_NOWARN); if (!work) return false; work->wakeup_all = false; work->vcpu = vcpu; work->cr2_or_gpa = cr2_or_gpa; work->addr = hva; work->arch = *arch; work->mm = current->mm; mmget(work->mm); kvm_get_kvm(work->vcpu->kvm); INIT_WORK(&work->work, async_pf_execute); list_add_tail(&work->queue, &vcpu->async_pf.queue); vcpu->async_pf.queued++; work->notpresent_injected = kvm_arch_async_page_not_present(vcpu, work); schedule_work(&work->work); return true; } int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) { struct kvm_async_pf *work; bool first; if (!list_empty_careful(&vcpu->async_pf.done)) return 0; work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC); if (!work) return -ENOMEM; work->wakeup_all = true; INIT_LIST_HEAD(&work->queue); /* for list_del to work */ spin_lock(&vcpu->async_pf.lock); first = list_empty(&vcpu->async_pf.done); list_add_tail(&work->link, &vcpu->async_pf.done); spin_unlock(&vcpu->async_pf.lock); if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first) kvm_arch_async_page_present_queued(vcpu); vcpu->async_pf.queued++; return 0; }
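/*
 * A condensed, illustrative sketch (not taken from any real architecture;
 * the function name and the empty arch payload are assumptions) of how the
 * helpers above are typically driven. On a guest access whose host page is
 * not resident, arch code tries kvm_setup_async_pf(); if that fails (the
 * per-vcpu ASYNC_PF_PER_VCPU limit was hit, the hva is an error value, or
 * the work item could not be allocated), the fault must be handled
 * synchronously. Completed items are later reaped from the vcpu run loop
 * via kvm_check_async_pf_completion().
 */
static bool example_try_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
				 unsigned long hva)
{
	struct kvm_arch_async_pf arch = {};	/* arch payload, left empty here */

	return kvm_setup_async_pf(vcpu, cr2_or_gpa, hva, &arch);
}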
809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 
1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 
2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 
// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/journal.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Generic filesystem journal-writing code; part of the ext2fs
 * journaling system.
 *
 * This file manages journals: areas of disk reserved for logging
 * transactional updates. This includes the kernel journaling thread
 * which is responsible for scheduling updates to the log.
 *
 * We do not actually manage the physical storage of the journal in this
 * file: that is left to a per-journal policy function, which allows us
 * to store the journal within a filesystem-specified area for ext2
 * journaling (ext2 can use a reserved inode for storing the log).
 */

#include <linux/module.h>
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/freezer.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/poison.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/math64.h>
#include <linux/hash.h>
#include <linux/log2.h>
#include <linux/vmalloc.h>
#include <linux/backing-dev.h>
#include <linux/bitops.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>

#define CREATE_TRACE_POINTS
#include <trace/events/jbd2.h>

#include <linux/uaccess.h>
#include <asm/page.h>

#ifdef CONFIG_JBD2_DEBUG
static ushort jbd2_journal_enable_debug __read_mostly;

module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
#endif

EXPORT_SYMBOL(jbd2_journal_extend);
EXPORT_SYMBOL(jbd2_journal_stop);
EXPORT_SYMBOL(jbd2_journal_lock_updates);
EXPORT_SYMBOL(jbd2_journal_unlock_updates);
EXPORT_SYMBOL(jbd2_journal_get_write_access);
EXPORT_SYMBOL(jbd2_journal_get_create_access);
EXPORT_SYMBOL(jbd2_journal_get_undo_access);
EXPORT_SYMBOL(jbd2_journal_set_triggers);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
EXPORT_SYMBOL(jbd2_journal_forget);
EXPORT_SYMBOL(jbd2_journal_flush);
EXPORT_SYMBOL(jbd2_journal_revoke);
EXPORT_SYMBOL(jbd2_journal_init_dev);
EXPORT_SYMBOL(jbd2_journal_init_inode);
EXPORT_SYMBOL(jbd2_journal_check_used_features);
EXPORT_SYMBOL(jbd2_journal_check_available_features);
EXPORT_SYMBOL(jbd2_journal_set_features);
EXPORT_SYMBOL(jbd2_journal_load);
EXPORT_SYMBOL(jbd2_journal_destroy);
EXPORT_SYMBOL(jbd2_journal_abort);
EXPORT_SYMBOL(jbd2_journal_errno);
EXPORT_SYMBOL(jbd2_journal_ack_err); EXPORT_SYMBOL(jbd2_journal_clear_err); EXPORT_SYMBOL(jbd2_log_wait_commit); EXPORT_SYMBOL(jbd2_journal_start_commit); EXPORT_SYMBOL(jbd2_journal_force_commit_nested); EXPORT_SYMBOL(jbd2_journal_wipe); EXPORT_SYMBOL(jbd2_journal_blocks_per_page); EXPORT_SYMBOL(jbd2_journal_invalidate_folio); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait); EXPORT_SYMBOL(jbd2_journal_finish_inode_data_buffers); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); EXPORT_SYMBOL(jbd2_inode_cache); static int jbd2_journal_create_slab(size_t slab_size); #ifdef CONFIG_JBD2_DEBUG void __jbd2_debug(int level, const char *file, const char *func, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; if (level > jbd2_journal_enable_debug) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf); va_end(args); } #endif /* Checksumming functions */ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) { __u32 csum; __be32 old_csum; old_csum = sb->s_checksum; sb->s_checksum = 0; csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t)); sb->s_checksum = old_csum; return cpu_to_be32(csum); } /* * Helper function used to manage commit timeouts */ static void commit_timeout(struct timer_list *t) { journal_t *journal = from_timer(journal, t, j_commit_timer); wake_up_process(journal->j_task); } /* * kjournald2: The main thread function used to manage a logging device * journal. * * This kernel thread is responsible for two things: * * 1) COMMIT: Every so often we need to commit the current state of the * filesystem to disk. The journal thread is responsible for writing * all of the metadata buffers to disk. If a fast commit is ongoing * journal thread waits until it's done and then continues from * there on. * * 2) CHECKPOINT: We cannot reuse a used section of the log file until all * of the data in that part of the log has been rewritten elsewhere on * the disk. Flushing these old buffers to reclaim space in the log is * known as checkpointing, and this thread is responsible for that job. */ static int kjournald2(void *arg) { journal_t *journal = arg; transaction_t *transaction; /* * Set up an interval timer which can be used to trigger a commit wakeup * after the commit interval expires */ timer_setup(&journal->j_commit_timer, commit_timeout, 0); set_freezable(); /* Record that the journal thread is running */ journal->j_task = current; wake_up(&journal->j_wait_done_commit); /* * Make sure that no allocations from this kernel thread will ever * recurse to the fs layer because we are responsible for the * transaction commit and any fs involvement might get stuck waiting for * the trasn. commit. */ memalloc_nofs_save(); /* * And now, wait forever for commit wakeup events. 
*/ write_lock(&journal->j_state_lock); loop: if (journal->j_flags & JBD2_UNMOUNT) goto end_loop; jbd2_debug(1, "commit_sequence=%u, commit_request=%u\n", journal->j_commit_sequence, journal->j_commit_request); if (journal->j_commit_sequence != journal->j_commit_request) { jbd2_debug(1, "OK, requests differ\n"); write_unlock(&journal->j_state_lock); del_timer_sync(&journal->j_commit_timer); jbd2_journal_commit_transaction(journal); write_lock(&journal->j_state_lock); goto loop; } wake_up(&journal->j_wait_done_commit); if (freezing(current)) { /* * The simpler the better. Flushing journal isn't a * good idea, because that depends on threads that may * be already stopped. */ jbd2_debug(1, "Now suspending kjournald2\n"); write_unlock(&journal->j_state_lock); try_to_freeze(); write_lock(&journal->j_state_lock); } else { /* * We assume on resume that commits are already there, * so we don't sleep */ DEFINE_WAIT(wait); int should_sleep = 1; prepare_to_wait(&journal->j_wait_commit, &wait, TASK_INTERRUPTIBLE); if (journal->j_commit_sequence != journal->j_commit_request) should_sleep = 0; transaction = journal->j_running_transaction; if (transaction && time_after_eq(jiffies, transaction->t_expires)) should_sleep = 0; if (journal->j_flags & JBD2_UNMOUNT) should_sleep = 0; if (should_sleep) { write_unlock(&journal->j_state_lock); schedule(); write_lock(&journal->j_state_lock); } finish_wait(&journal->j_wait_commit, &wait); } jbd2_debug(1, "kjournald2 wakes\n"); /* * Were we woken up by a commit wakeup event? */ transaction = journal->j_running_transaction; if (transaction && time_after_eq(jiffies, transaction->t_expires)) { journal->j_commit_request = transaction->t_tid; jbd2_debug(1, "woke because of timeout\n"); } goto loop; end_loop: del_timer_sync(&journal->j_commit_timer); journal->j_task = NULL; wake_up(&journal->j_wait_done_commit); jbd2_debug(1, "Journal thread exiting.\n"); write_unlock(&journal->j_state_lock); return 0; } static int jbd2_journal_start_thread(journal_t *journal) { struct task_struct *t; t = kthread_run(kjournald2, journal, "jbd2/%s", journal->j_devname); if (IS_ERR(t)) return PTR_ERR(t); wait_event(journal->j_wait_done_commit, journal->j_task != NULL); return 0; } static void journal_kill_thread(journal_t *journal) { write_lock(&journal->j_state_lock); journal->j_flags |= JBD2_UNMOUNT; while (journal->j_task) { write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_commit); wait_event(journal->j_wait_done_commit, journal->j_task == NULL); write_lock(&journal->j_state_lock); } write_unlock(&journal->j_state_lock); } /* * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal. * * Writes a metadata buffer to a given disk block. The actual IO is not * performed but a new buffer_head is constructed which labels the data * to be written with the correct destination disk block. * * Any magic-number escaping which needs to be done will cause a * copy-out here. If the buffer happens to start with the * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the * magic number is only written to the log for descripter blocks. In * this case, we copy the data and replace the first word with 0, and we * return a result code which indicates that this buffer needs to be * marked as an escaped buffer in the corresponding log descriptor * block. The missing word can then be restored when the block is read * during recovery. 
* * If the source buffer has already been modified by a new transaction * since we took the last commit snapshot, we use the frozen copy of * that data for IO. If we end up using the existing buffer_head's data * for the write, then we have to make sure nobody modifies it while the * IO is in progress. do_get_write_access() handles this. * * The function returns a pointer to the buffer_head to be used for IO. * * * Return value: * <0: Error * >=0: Finished OK * * On success: * Bit 0 set == escape performed on the data * Bit 1 set == buffer copy-out performed (kfree the data after IO) */ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct journal_head *jh_in, struct buffer_head **bh_out, sector_t blocknr) { int need_copy_out = 0; int done_copy_out = 0; int do_escape = 0; char *mapped_data; struct buffer_head *new_bh; struct folio *new_folio; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); journal_t *journal = transaction->t_journal; /* * The buffer really shouldn't be locked: only the current committing * transaction is allowed to write it, so nobody else is allowed * to do any IO. * * akpm: except if we're journalling data, and write() output is * also part of a shared mapping, and another thread has * decided to launch a writepage() against this buffer. */ J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); /* keep subsequent assertions sane */ atomic_set(&new_bh->b_count, 1); spin_lock(&jh_in->b_state_lock); repeat: /* * If a new transaction has already done a buffer copy-out, then * we use that version of the data for the commit. */ if (jh_in->b_frozen_data) { done_copy_out = 1; new_folio = virt_to_folio(jh_in->b_frozen_data); new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data); } else { new_folio = jh2bh(jh_in)->b_folio; new_offset = offset_in_folio(new_folio, jh2bh(jh_in)->b_data); } mapped_data = kmap_local_folio(new_folio, new_offset); /* * Fire data frozen trigger if data already wasn't frozen. Do this * before checking for escaping, as the trigger may modify the magic * offset. If a copy-out happens afterwards, it will have the correct * data in the buffer. */ if (!done_copy_out) jbd2_buffer_frozen_trigger(jh_in, mapped_data, jh_in->b_triggers); /* * Check for escaping */ if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER)) { need_copy_out = 1; do_escape = 1; } kunmap_local(mapped_data); /* * Do we need to do a data copy? */ if (need_copy_out && !done_copy_out) { char *tmp; spin_unlock(&jh_in->b_state_lock); tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); if (!tmp) { brelse(new_bh); return -ENOMEM; } spin_lock(&jh_in->b_state_lock); if (jh_in->b_frozen_data) { jbd2_free(tmp, bh_in->b_size); goto repeat; } jh_in->b_frozen_data = tmp; memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size); new_folio = virt_to_folio(tmp); new_offset = offset_in_folio(new_folio, tmp); done_copy_out = 1; /* * This isn't strictly necessary, as we're using frozen * data for the escaping, but it keeps consistency with * b_frozen_data usage. */ jh_in->b_frozen_triggers = jh_in->b_triggers; } /* * Did we need to do an escaping? Now we've done all the * copying, we can finally do so. 
*/ if (do_escape) { mapped_data = kmap_local_folio(new_folio, new_offset); *((unsigned int *)mapped_data) = 0; kunmap_local(mapped_data); } folio_set_bh(new_bh, new_folio, new_offset); new_bh->b_size = bh_in->b_size; new_bh->b_bdev = journal->j_dev; new_bh->b_blocknr = blocknr; new_bh->b_private = bh_in; set_buffer_mapped(new_bh); set_buffer_dirty(new_bh); *bh_out = new_bh; /* * The to-be-written buffer needs to get moved to the io queue, * and the original buffer whose contents we are shadowing or * copying is moved to the transaction's shadow queue. */ JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); spin_unlock(&journal->j_list_lock); set_buffer_shadow(bh_in); spin_unlock(&jh_in->b_state_lock); return do_escape | (done_copy_out << 1); } /* * Allocation code for the journal file. Manage the space left in the * journal, so that we can begin checkpointing when appropriate. */ /* * Called with j_state_lock locked for writing. * Returns true if a transaction commit was started. */ static int __jbd2_log_start_commit(journal_t *journal, tid_t target) { /* Return if the txn has already requested to be committed */ if (journal->j_commit_request == target) return 0; /* * The only transaction we can possibly wait upon is the * currently running transaction (if it exists). Otherwise, * the target tid must be an old one. */ if (journal->j_running_transaction && journal->j_running_transaction->t_tid == target) { /* * We want a new commit: OK, mark the request and wakeup the * commit thread. We do _not_ do the commit ourselves. */ journal->j_commit_request = target; jbd2_debug(1, "JBD2: requesting commit %u/%u\n", journal->j_commit_request, journal->j_commit_sequence); journal->j_running_transaction->t_requested = jiffies; wake_up(&journal->j_wait_commit); return 1; } else if (!tid_geq(journal->j_commit_request, target)) /* This should never happen, but if it does, preserve the evidence before kjournald goes into a loop and increments j_commit_sequence beyond all recognition. */ WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n", journal->j_commit_request, journal->j_commit_sequence, target, journal->j_running_transaction ? journal->j_running_transaction->t_tid : 0); return 0; } int jbd2_log_start_commit(journal_t *journal, tid_t tid) { int ret; write_lock(&journal->j_state_lock); ret = __jbd2_log_start_commit(journal, tid); write_unlock(&journal->j_state_lock); return ret; } /* * Force and wait any uncommitted transactions. We can only force the running * transaction if we don't have an active handle, otherwise, we will deadlock. * Returns: <0 in case of error, * 0 if nothing to commit, * 1 if transaction was successfully committed. 
*/ static int __jbd2_journal_force_commit(journal_t *journal) { transaction_t *transaction = NULL; tid_t tid; int need_to_start = 0, ret = 0; read_lock(&journal->j_state_lock); if (journal->j_running_transaction && !current->journal_info) { transaction = journal->j_running_transaction; if (!tid_geq(journal->j_commit_request, transaction->t_tid)) need_to_start = 1; } else if (journal->j_committing_transaction) transaction = journal->j_committing_transaction; if (!transaction) { /* Nothing to commit */ read_unlock(&journal->j_state_lock); return 0; } tid = transaction->t_tid; read_unlock(&journal->j_state_lock); if (need_to_start) jbd2_log_start_commit(journal, tid); ret = jbd2_log_wait_commit(journal, tid); if (!ret) ret = 1; return ret; } /** * jbd2_journal_force_commit_nested - Force and wait upon a commit if the * calling process is not within transaction. * * @journal: journal to force * Returns true if progress was made. * * This is used for forcing out undo-protected data which contains * bitmaps, when the fs is running out of space. */ int jbd2_journal_force_commit_nested(journal_t *journal) { int ret; ret = __jbd2_journal_force_commit(journal); return ret > 0; } /** * jbd2_journal_force_commit() - force any uncommitted transactions * @journal: journal to force * * Caller want unconditional commit. We can only force the running transaction * if we don't have an active handle, otherwise, we will deadlock. */ int jbd2_journal_force_commit(journal_t *journal) { int ret; J_ASSERT(!current->journal_info); ret = __jbd2_journal_force_commit(journal); if (ret > 0) ret = 0; return ret; } /* * Start a commit of the current running transaction (if any). Returns true * if a transaction is going to be committed (or is currently already * committing), and fills its tid in at *ptid */ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) { int ret = 0; write_lock(&journal->j_state_lock); if (journal->j_running_transaction) { tid_t tid = journal->j_running_transaction->t_tid; __jbd2_log_start_commit(journal, tid); /* There's a running transaction and we've just made sure * it's commit has been scheduled. */ if (ptid) *ptid = tid; ret = 1; } else if (journal->j_committing_transaction) { /* * If commit has been started, then we have to wait for * completion of that transaction. */ if (ptid) *ptid = journal->j_committing_transaction->t_tid; ret = 1; } write_unlock(&journal->j_state_lock); return ret; } /* * Return 1 if a given transaction has not yet sent barrier request * connected with a transaction commit. If 0 is returned, transaction * may or may not have sent the barrier. Used to avoid sending barrier * twice in common cases. */ int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) { int ret = 0; transaction_t *commit_trans; if (!(journal->j_flags & JBD2_BARRIER)) return 0; read_lock(&journal->j_state_lock); /* Transaction already committed? */ if (tid_geq(journal->j_commit_sequence, tid)) goto out; commit_trans = journal->j_committing_transaction; if (!commit_trans || commit_trans->t_tid != tid) { ret = 1; goto out; } /* * Transaction is being committed and we already proceeded to * submitting a flush to fs partition? 
*/ if (journal->j_fs_dev != journal->j_dev) { if (!commit_trans->t_need_data_flush || commit_trans->t_state >= T_COMMIT_DFLUSH) goto out; } else { if (commit_trans->t_state >= T_COMMIT_JFLUSH) goto out; } ret = 1; out: read_unlock(&journal->j_state_lock); return ret; } EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier); /* * Wait for a specified commit to complete. * The caller may not hold the journal lock. */ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) { int err = 0; read_lock(&journal->j_state_lock); #ifdef CONFIG_PROVE_LOCKING /* * Some callers make sure transaction is already committing and in that * case we cannot block on open handles anymore. So don't warn in that * case. */ if (tid_gt(tid, journal->j_commit_sequence) && (!journal->j_committing_transaction || journal->j_committing_transaction->t_tid != tid)) { read_unlock(&journal->j_state_lock); jbd2_might_wait_for_commit(journal); read_lock(&journal->j_state_lock); } #endif #ifdef CONFIG_JBD2_DEBUG if (!tid_geq(journal->j_commit_request, tid)) { printk(KERN_ERR "%s: error: j_commit_request=%u, tid=%u\n", __func__, journal->j_commit_request, tid); } #endif while (tid_gt(tid, journal->j_commit_sequence)) { jbd2_debug(1, "JBD2: want %u, j_commit_sequence=%u\n", tid, journal->j_commit_sequence); read_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_commit); wait_event(journal->j_wait_done_commit, !tid_gt(tid, journal->j_commit_sequence)); read_lock(&journal->j_state_lock); } read_unlock(&journal->j_state_lock); if (unlikely(is_journal_aborted(journal))) err = -EIO; return err; } /* * Start a fast commit. If there's an ongoing fast or full commit wait for * it to complete. Returns 0 if a new fast commit was started. Returns -EALREADY * if a fast commit is not needed, either because there's an already a commit * going on or this tid has already been committed. Returns -EINVAL if no jbd2 * commit has yet been performed. */ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) { if (unlikely(is_journal_aborted(journal))) return -EIO; /* * Fast commits only allowed if at least one full commit has * been processed. */ if (!journal->j_stats.ts_tid) return -EINVAL; write_lock(&journal->j_state_lock); if (tid <= journal->j_commit_sequence) { write_unlock(&journal->j_state_lock); return -EALREADY; } if (journal->j_flags & JBD2_FULL_COMMIT_ONGOING || (journal->j_flags & JBD2_FAST_COMMIT_ONGOING)) { DEFINE_WAIT(wait); prepare_to_wait(&journal->j_fc_wait, &wait, TASK_UNINTERRUPTIBLE); write_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_fc_wait, &wait); return -EALREADY; } journal->j_flags |= JBD2_FAST_COMMIT_ONGOING; write_unlock(&journal->j_state_lock); jbd2_journal_lock_updates(journal); return 0; } EXPORT_SYMBOL(jbd2_fc_begin_commit); /* * Stop a fast commit. If fallback is set, this function starts commit of * TID tid before any other fast commit can start. 
*/ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) { jbd2_journal_unlock_updates(journal); if (journal->j_fc_cleanup_callback) journal->j_fc_cleanup_callback(journal, 0, tid); write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; if (fallback) journal->j_flags |= JBD2_FULL_COMMIT_ONGOING; write_unlock(&journal->j_state_lock); wake_up(&journal->j_fc_wait); if (fallback) return jbd2_complete_transaction(journal, tid); return 0; } int jbd2_fc_end_commit(journal_t *journal) { return __jbd2_fc_end_commit(journal, 0, false); } EXPORT_SYMBOL(jbd2_fc_end_commit); int jbd2_fc_end_commit_fallback(journal_t *journal) { tid_t tid; read_lock(&journal->j_state_lock); tid = journal->j_running_transaction ? journal->j_running_transaction->t_tid : 0; read_unlock(&journal->j_state_lock); return __jbd2_fc_end_commit(journal, tid, true); } EXPORT_SYMBOL(jbd2_fc_end_commit_fallback); /* Return 1 when transaction with given tid has already committed. */ int jbd2_transaction_committed(journal_t *journal, tid_t tid) { int ret = 1; read_lock(&journal->j_state_lock); if (journal->j_running_transaction && journal->j_running_transaction->t_tid == tid) ret = 0; if (journal->j_committing_transaction && journal->j_committing_transaction->t_tid == tid) ret = 0; read_unlock(&journal->j_state_lock); return ret; } EXPORT_SYMBOL(jbd2_transaction_committed); /* * When this function returns the transaction corresponding to tid * will be completed. If the transaction has currently running, start * committing that transaction before waiting for it to complete. If * the transaction id is stale, it is by definition already completed, * so just return SUCCESS. */ int jbd2_complete_transaction(journal_t *journal, tid_t tid) { int need_to_wait = 1; read_lock(&journal->j_state_lock); if (journal->j_running_transaction && journal->j_running_transaction->t_tid == tid) { if (journal->j_commit_request != tid) { /* transaction not yet started, so request it */ read_unlock(&journal->j_state_lock); jbd2_log_start_commit(journal, tid); goto wait_commit; } } else if (!(journal->j_committing_transaction && journal->j_committing_transaction->t_tid == tid)) need_to_wait = 0; read_unlock(&journal->j_state_lock); if (!need_to_wait) return 0; wait_commit: return jbd2_log_wait_commit(journal, tid); } EXPORT_SYMBOL(jbd2_complete_transaction); /* * Log buffer allocation routines: */ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) { unsigned long blocknr; write_lock(&journal->j_state_lock); J_ASSERT(journal->j_free > 1); blocknr = journal->j_head; journal->j_head++; journal->j_free--; if (journal->j_head == journal->j_last) journal->j_head = journal->j_first; write_unlock(&journal->j_state_lock); return jbd2_journal_bmap(journal, blocknr, retp); } /* Map one fast commit buffer for use by the file system */ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) { unsigned long long pblock; unsigned long blocknr; int ret = 0; struct buffer_head *bh; int fc_off; *bh_out = NULL; if (journal->j_fc_off + journal->j_fc_first < journal->j_fc_last) { fc_off = journal->j_fc_off; blocknr = journal->j_fc_first + fc_off; journal->j_fc_off++; } else { ret = -EINVAL; } if (ret) return ret; ret = jbd2_journal_bmap(journal, blocknr, &pblock); if (ret) return ret; bh = __getblk(journal->j_dev, pblock, journal->j_blocksize); if (!bh) return -ENOMEM; journal->j_fc_wbuf[fc_off] = bh; *bh_out = bh; return 0; } EXPORT_SYMBOL(jbd2_fc_get_buf); /* * Wait on fast commit buffers 
that were allocated by jbd2_fc_get_buf * for completion. */ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks) { struct buffer_head *bh; int i, j_fc_off; j_fc_off = journal->j_fc_off; /* * Wait in reverse order to minimize chances of us being woken up before * all IOs have completed */ for (i = j_fc_off - 1; i >= j_fc_off - num_blks; i--) { bh = journal->j_fc_wbuf[i]; wait_on_buffer(bh); /* * Update j_fc_off so jbd2_fc_release_bufs can release remain * buffer head. */ if (unlikely(!buffer_uptodate(bh))) { journal->j_fc_off = i + 1; return -EIO; } put_bh(bh); journal->j_fc_wbuf[i] = NULL; } return 0; } EXPORT_SYMBOL(jbd2_fc_wait_bufs); int jbd2_fc_release_bufs(journal_t *journal) { struct buffer_head *bh; int i, j_fc_off; j_fc_off = journal->j_fc_off; for (i = j_fc_off - 1; i >= 0; i--) { bh = journal->j_fc_wbuf[i]; if (!bh) break; put_bh(bh); journal->j_fc_wbuf[i] = NULL; } return 0; } EXPORT_SYMBOL(jbd2_fc_release_bufs); /* * Conversion of logical to physical block numbers for the journal * * On external journals the journal blocks are identity-mapped, so * this is a no-op. If needed, we can use j_blk_offset - everything is * ready. */ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, unsigned long long *retp) { int err = 0; unsigned long long ret; sector_t block = blocknr; if (journal->j_bmap) { err = journal->j_bmap(journal, &block); if (err == 0) *retp = block; } else if (journal->j_inode) { ret = bmap(journal->j_inode, &block); if (ret || !block) { printk(KERN_ALERT "%s: journal block not found " "at offset %lu on %s\n", __func__, blocknr, journal->j_devname); err = -EIO; jbd2_journal_abort(journal, err); } else { *retp = block; } } else { *retp = blocknr; /* +journal->j_blk_offset */ } return err; } /* * We play buffer_head aliasing tricks to write data/metadata blocks to * the journal without copying their contents, but for journal * descriptor blocks we do need to generate bona fide buffers. * * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying * the buffer's contents they really should run flush_dcache_page(bh->b_page). * But we don't bother doing that, so there will be coherency problems with * mmaps of blockdevs which hold live JBD-controlled filesystems. 
*/ struct buffer_head * jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type) { journal_t *journal = transaction->t_journal; struct buffer_head *bh; unsigned long long blocknr; journal_header_t *header; int err; err = jbd2_journal_next_log_block(journal, &blocknr); if (err) return NULL; bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); if (!bh) return NULL; atomic_dec(&transaction->t_outstanding_credits); lock_buffer(bh); memset(bh->b_data, 0, journal->j_blocksize); header = (journal_header_t *)bh->b_data; header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); header->h_blocktype = cpu_to_be32(type); header->h_sequence = cpu_to_be32(transaction->t_tid); set_buffer_uptodate(bh); unlock_buffer(bh); BUFFER_TRACE(bh, "return this buffer"); return bh; } void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh) { struct jbd2_journal_block_tail *tail; __u32 csum; if (!jbd2_journal_has_csum_v2or3(j)) return; tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); tail->t_checksum = 0; csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); tail->t_checksum = cpu_to_be32(csum); } /* * Return tid of the oldest transaction in the journal and block in the journal * where the transaction starts. * * If the journal is now empty, return which will be the next transaction ID * we will write and where will that transaction start. * * The return value is 0 if journal tail cannot be pushed any further, 1 if * it can. */ int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, unsigned long *block) { transaction_t *transaction; int ret; read_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); transaction = journal->j_checkpoint_transactions; if (transaction) { *tid = transaction->t_tid; *block = transaction->t_log_start; } else if ((transaction = journal->j_committing_transaction) != NULL) { *tid = transaction->t_tid; *block = transaction->t_log_start; } else if ((transaction = journal->j_running_transaction) != NULL) { *tid = transaction->t_tid; *block = journal->j_head; } else { *tid = journal->j_transaction_sequence; *block = journal->j_head; } ret = tid_gt(*tid, journal->j_tail_sequence); spin_unlock(&journal->j_list_lock); read_unlock(&journal->j_state_lock); return ret; } /* * Update information in journal structure and in on disk journal superblock * about log tail. This function does not check whether information passed in * really pushes log tail further. It's responsibility of the caller to make * sure provided log tail information is valid (e.g. by holding * j_checkpoint_mutex all the time between computing log tail and calling this * function as is the case with jbd2_cleanup_journal_tail()). * * Requires j_checkpoint_mutex */ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) { unsigned long freed; int ret; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); /* * We cannot afford for write to remain in drive's caches since as * soon as we update j_tail, next transaction can start reusing journal * space and if we lose sb update during power failure we'd replay * old transaction with possibly newly overwritten data. 
*/ ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA); if (ret) goto out; write_lock(&journal->j_state_lock); freed = block - journal->j_tail; if (block < journal->j_tail) freed += journal->j_last - journal->j_first; trace_jbd2_update_log_tail(journal, tid, block, freed); jbd2_debug(1, "Cleaning journal tail from %u to %u (offset %lu), " "freeing %lu\n", journal->j_tail_sequence, tid, block, freed); journal->j_free += freed; journal->j_tail_sequence = tid; journal->j_tail = block; write_unlock(&journal->j_state_lock); out: return ret; } /* * This is a variation of __jbd2_update_log_tail which checks for validity of * provided log tail and locks j_checkpoint_mutex. So it is safe against races * with other threads updating log tail. */ void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) { mutex_lock_io(&journal->j_checkpoint_mutex); if (tid_gt(tid, journal->j_tail_sequence)) __jbd2_update_log_tail(journal, tid, block); mutex_unlock(&journal->j_checkpoint_mutex); } struct jbd2_stats_proc_session { journal_t *journal; struct transaction_stats_s *stats; int start; int max; }; static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) { return *pos ? NULL : SEQ_START_TOKEN; } static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return NULL; } static int jbd2_seq_info_show(struct seq_file *seq, void *v) { struct jbd2_stats_proc_session *s = seq->private; if (v != SEQ_START_TOKEN) return 0; seq_printf(seq, "%lu transactions (%lu requested), " "each up to %u blocks\n", s->stats->ts_tid, s->stats->ts_requested, s->journal->j_max_transaction_buffers); if (s->stats->ts_tid == 0) return 0; seq_printf(seq, "average: \n %ums waiting for transaction\n", jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid)); seq_printf(seq, " %ums request delay\n", (s->stats->ts_requested == 0) ? 
0 : jiffies_to_msecs(s->stats->run.rs_request_delay / s->stats->ts_requested)); seq_printf(seq, " %ums running transaction\n", jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid)); seq_printf(seq, " %ums transaction was being locked\n", jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid)); seq_printf(seq, " %ums flushing data (in ordered mode)\n", jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid)); seq_printf(seq, " %ums logging transaction\n", jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid)); seq_printf(seq, " %lluus average transaction commit time\n", div_u64(s->journal->j_average_commit_time, 1000)); seq_printf(seq, " %lu handles per transaction\n", s->stats->run.rs_handle_count / s->stats->ts_tid); seq_printf(seq, " %lu blocks per transaction\n", s->stats->run.rs_blocks / s->stats->ts_tid); seq_printf(seq, " %lu logged blocks per transaction\n", s->stats->run.rs_blocks_logged / s->stats->ts_tid); return 0; } static void jbd2_seq_info_stop(struct seq_file *seq, void *v) { } static const struct seq_operations jbd2_seq_info_ops = { .start = jbd2_seq_info_start, .next = jbd2_seq_info_next, .stop = jbd2_seq_info_stop, .show = jbd2_seq_info_show, }; static int jbd2_seq_info_open(struct inode *inode, struct file *file) { journal_t *journal = pde_data(inode); struct jbd2_stats_proc_session *s; int rc, size; s = kmalloc(sizeof(*s), GFP_KERNEL); if (s == NULL) return -ENOMEM; size = sizeof(struct transaction_stats_s); s->stats = kmalloc(size, GFP_KERNEL); if (s->stats == NULL) { kfree(s); return -ENOMEM; } spin_lock(&journal->j_history_lock); memcpy(s->stats, &journal->j_stats, size); s->journal = journal; spin_unlock(&journal->j_history_lock); rc = seq_open(file, &jbd2_seq_info_ops); if (rc == 0) { struct seq_file *m = file->private_data; m->private = s; } else { kfree(s->stats); kfree(s); } return rc; } static int jbd2_seq_info_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct jbd2_stats_proc_session *s = seq->private; kfree(s->stats); kfree(s); return seq_release(inode, file); } static const struct proc_ops jbd2_info_proc_ops = { .proc_open = jbd2_seq_info_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = jbd2_seq_info_release, }; static struct proc_dir_entry *proc_jbd2_stats; static void jbd2_stats_proc_init(journal_t *journal) { journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); if (journal->j_proc_entry) { proc_create_data("info", S_IRUGO, journal->j_proc_entry, &jbd2_info_proc_ops, journal); } } static void jbd2_stats_proc_exit(journal_t *journal) { remove_proc_entry("info", journal->j_proc_entry); remove_proc_entry(journal->j_devname, proc_jbd2_stats); } /* Minimum size of descriptor tag */ static int jbd2_min_tag_size(void) { /* * Tag with 32-bit block numbers does not use last four bytes of the * structure */ return sizeof(journal_block_tag_t) - 4; } /** * jbd2_journal_shrink_scan() * @shrink: shrinker to work on * @sc: reclaim request to process * * Scan the checkpointed buffer on the checkpoint list and release the * journal_head. 
*/ static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { journal_t *journal = shrink->private_data; unsigned long nr_to_scan = sc->nr_to_scan; unsigned long nr_shrunk; unsigned long count; count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count); nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan); count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count); return nr_shrunk; } /** * jbd2_journal_shrink_count() * @shrink: shrinker to work on * @sc: reclaim request to process * * Count the number of checkpoint buffers on the checkpoint list. */ static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { journal_t *journal = shrink->private_data; unsigned long count; count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); trace_jbd2_shrink_count(journal, sc->nr_to_scan, count); return count; } /* * If the journal init or create aborts, we need to mark the journal * superblock as being NULL to prevent the journal destroy from writing * back a bogus superblock. */ static void journal_fail_superblock(journal_t *journal) { struct buffer_head *bh = journal->j_sb_buffer; brelse(bh); journal->j_sb_buffer = NULL; } /* * Check the superblock for a given journal, performing initial * validation of the format. */ static int journal_check_superblock(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; int num_fc_blks; int err = -EINVAL; if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { printk(KERN_WARNING "JBD2: no valid journal superblock found\n"); return err; } if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 && be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) { printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); return err; } if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { printk(KERN_WARNING "JBD2: journal file too short\n"); return err; } if (be32_to_cpu(sb->s_first) == 0 || be32_to_cpu(sb->s_first) >= journal->j_total_len) { printk(KERN_WARNING "JBD2: Invalid start block of journal: %u\n", be32_to_cpu(sb->s_first)); return err; } /* * If this is a V2 superblock, then we have to check the * features flags on it. */ if (!jbd2_format_support_feature(journal)) return 0; if ((sb->s_feature_ro_compat & ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || (sb->s_feature_incompat & ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { printk(KERN_WARNING "JBD2: Unrecognised features on journal\n"); return err; } num_fc_blks = jbd2_has_feature_fast_commit(journal) ? jbd2_journal_get_num_fc_blks(sb) : 0; if (be32_to_cpu(sb->s_maxlen) < JBD2_MIN_JOURNAL_BLOCKS || be32_to_cpu(sb->s_maxlen) - JBD2_MIN_JOURNAL_BLOCKS < num_fc_blks) { printk(KERN_ERR "JBD2: journal file too short %u,%d\n", be32_to_cpu(sb->s_maxlen), num_fc_blks); return err; } if (jbd2_has_feature_csum2(journal) && jbd2_has_feature_csum3(journal)) { /* Can't have checksum v2 and v3 at the same time! */ printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " "at the same time!\n"); return err; } if (jbd2_journal_has_csum_v2or3_feature(journal) && jbd2_has_feature_checksum(journal)) { /* Can't have checksum v1 and v2 on at the same time! 
*/ printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " "at the same time!\n"); return err; } /* Load the checksum driver */ if (jbd2_journal_has_csum_v2or3_feature(journal)) { if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) { printk(KERN_ERR "JBD2: Unknown checksum type\n"); return err; } journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(journal->j_chksum_driver)) { printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); err = PTR_ERR(journal->j_chksum_driver); journal->j_chksum_driver = NULL; return err; } /* Check superblock checksum */ if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { printk(KERN_ERR "JBD2: journal checksum error\n"); err = -EFSBADCRC; return err; } } return 0; } static int journal_revoke_records_per_block(journal_t *journal) { int record_size; int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t); if (jbd2_has_feature_64bit(journal)) record_size = 8; else record_size = 4; if (jbd2_journal_has_csum_v2or3(journal)) space -= sizeof(struct jbd2_journal_block_tail); return space / record_size; } /* * Load the on-disk journal superblock and read the key fields into the * journal_t. */ static int journal_load_superblock(journal_t *journal) { int err; struct buffer_head *bh; journal_superblock_t *sb; bh = getblk_unmovable(journal->j_dev, journal->j_blk_offset, journal->j_blocksize); if (bh) err = bh_read(bh, 0); if (!bh || err < 0) { pr_err("%s: Cannot read journal superblock\n", __func__); brelse(bh); return -EIO; } journal->j_sb_buffer = bh; sb = (journal_superblock_t *)bh->b_data; journal->j_superblock = sb; err = journal_check_superblock(journal); if (err) { journal_fail_superblock(journal); return err; } journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); journal->j_tail = be32_to_cpu(sb->s_start); journal->j_first = be32_to_cpu(sb->s_first); journal->j_errno = be32_to_cpu(sb->s_errno); journal->j_last = be32_to_cpu(sb->s_maxlen); if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) journal->j_total_len = be32_to_cpu(sb->s_maxlen); /* Precompute checksum seed for all metadata */ if (jbd2_journal_has_csum_v2or3(journal)) journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, sizeof(sb->s_uuid)); journal->j_revoke_records_per_block = journal_revoke_records_per_block(journal); if (jbd2_has_feature_fast_commit(journal)) { journal->j_fc_last = be32_to_cpu(sb->s_maxlen); journal->j_last = journal->j_fc_last - jbd2_journal_get_num_fc_blks(sb); journal->j_fc_first = journal->j_last + 1; journal->j_fc_off = 0; } return 0; } /* * Management for journal control blocks: functions to create and * destroy journal_t structures, and to initialise and read existing * journal blocks from disk. */ /* First: create and setup a journal_t object in memory. We initialise * very few fields yet: that has to wait until we have created the * journal structures from from scratch, or loaded them from disk. 
*/ static journal_t *journal_init_common(struct block_device *bdev, struct block_device *fs_dev, unsigned long long start, int len, int blocksize) { static struct lock_class_key jbd2_trans_commit_key; journal_t *journal; int err; int n; journal = kzalloc(sizeof(*journal), GFP_KERNEL); if (!journal) return ERR_PTR(-ENOMEM); journal->j_blocksize = blocksize; journal->j_dev = bdev; journal->j_fs_dev = fs_dev; journal->j_blk_offset = start; journal->j_total_len = len; jbd2_init_fs_dev_write_error(journal); err = journal_load_superblock(journal); if (err) goto err_cleanup; init_waitqueue_head(&journal->j_wait_transaction_locked); init_waitqueue_head(&journal->j_wait_done_commit); init_waitqueue_head(&journal->j_wait_commit); init_waitqueue_head(&journal->j_wait_updates); init_waitqueue_head(&journal->j_wait_reserved); init_waitqueue_head(&journal->j_fc_wait); mutex_init(&journal->j_abort_mutex); mutex_init(&journal->j_barrier); mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); spin_lock_init(&journal->j_list_lock); spin_lock_init(&journal->j_history_lock); rwlock_init(&journal->j_state_lock); journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); journal->j_min_batch_time = 0; journal->j_max_batch_time = 15000; /* 15ms */ atomic_set(&journal->j_reserved_credits, 0); lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", &jbd2_trans_commit_key, 0); /* The journal is marked for error until we succeed with recovery! */ journal->j_flags = JBD2_ABORT; /* Set up a default-sized revoke table for the new mount. */ err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); if (err) goto err_cleanup; /* * journal descriptor can store up to n blocks, we need enough * buffers to write out full descriptor block. */ err = -ENOMEM; n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; journal->j_fc_wbuf = NULL; journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *), GFP_KERNEL); if (!journal->j_wbuf) goto err_cleanup; err = percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL); if (err) goto err_cleanup; journal->j_shrink_transaction = NULL; journal->j_shrinker = shrinker_alloc(0, "jbd2-journal:(%u:%u)", MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); if (!journal->j_shrinker) { err = -ENOMEM; goto err_cleanup; } journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan; journal->j_shrinker->count_objects = jbd2_journal_shrink_count; journal->j_shrinker->batch = journal->j_max_transaction_buffers; journal->j_shrinker->private_data = journal; shrinker_register(journal->j_shrinker); return journal; err_cleanup: percpu_counter_destroy(&journal->j_checkpoint_jh_count); if (journal->j_chksum_driver) crypto_free_shash(journal->j_chksum_driver); kfree(journal->j_wbuf); jbd2_journal_destroy_revoke(journal); journal_fail_superblock(journal); kfree(journal); return ERR_PTR(err); } /* jbd2_journal_init_dev and jbd2_journal_init_inode: * * Create a journal structure assigned some fixed set of disk blocks to * the journal. We don't actually touch those disk blocks yet, but we * need to set up all of the mapping information to tell the journaling * system where the journal blocks are. * */ /** * journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure * @bdev: Block device on which to create the journal * @fs_dev: Device which hold journalled filesystem for this journal. * @start: Block nr Start of journal. * @len: Length of the journal in blocks. 
* @blocksize: blocksize of journalling device * * Returns: a newly created journal_t * * * jbd2_journal_init_dev creates a journal which maps a fixed contiguous * range of blocks on an arbitrary block device. * */ journal_t *jbd2_journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, unsigned long long start, int len, int blocksize) { journal_t *journal; journal = journal_init_common(bdev, fs_dev, start, len, blocksize); if (IS_ERR(journal)) return ERR_CAST(journal); snprintf(journal->j_devname, sizeof(journal->j_devname), "%pg", journal->j_dev); strreplace(journal->j_devname, '/', '!'); jbd2_stats_proc_init(journal); return journal; } /** * journal_t * jbd2_journal_init_inode () - creates a journal which maps to a inode. * @inode: An inode to create the journal in * * jbd2_journal_init_inode creates a journal which maps an on-disk inode as * the journal. The inode must exist already, must support bmap() and * must have all data blocks preallocated. */ journal_t *jbd2_journal_init_inode(struct inode *inode) { journal_t *journal; sector_t blocknr; int err = 0; blocknr = 0; err = bmap(inode, &blocknr); if (err || !blocknr) { pr_err("%s: Cannot locate journal superblock\n", __func__); return err ? ERR_PTR(err) : ERR_PTR(-EINVAL); } jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n", inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size, inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev, blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); if (IS_ERR(journal)) return ERR_CAST(journal); journal->j_inode = inode; snprintf(journal->j_devname, sizeof(journal->j_devname), "%pg-%lu", journal->j_dev, journal->j_inode->i_ino); strreplace(journal->j_devname, '/', '!'); jbd2_stats_proc_init(journal); return journal; } /* * Given a journal_t structure, initialise the various fields for * startup of a new journaling session. We use this both when creating * a journal, and after recovering an old journal to reset it for * subsequent use. */ static int journal_reset(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; unsigned long long first, last; first = be32_to_cpu(sb->s_first); last = be32_to_cpu(sb->s_maxlen); if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n", first, last); journal_fail_superblock(journal); return -EINVAL; } journal->j_first = first; journal->j_last = last; if (journal->j_head != 0 && journal->j_flags & JBD2_CYCLE_RECORD) { /* * Disable the cycled recording mode if the journal head block * number is not correct. */ if (journal->j_head < first || journal->j_head >= last) { printk(KERN_WARNING "JBD2: Incorrect Journal head block %lu, " "disable journal_cycle_record\n", journal->j_head); journal->j_head = journal->j_first; } } else { journal->j_head = journal->j_first; } journal->j_tail = journal->j_head; journal->j_free = journal->j_last - journal->j_first; journal->j_tail_sequence = journal->j_transaction_sequence; journal->j_commit_sequence = journal->j_transaction_sequence - 1; journal->j_commit_request = journal->j_commit_sequence; journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal); /* * Now that journal recovery is done, turn fast commits off here. This * way, if fast commit was enabled before the crash but if now FS has * disabled it, we don't enable fast commits. 
*/ jbd2_clear_feature_fast_commit(journal); /* * As a special case, if the on-disk copy is already marked as needing * no recovery (s_start == 0), then we can safely defer the superblock * update until the next commit by setting JBD2_FLUSHED. This avoids * attempting a write to a potential-readonly device. */ if (sb->s_start == 0) { jbd2_debug(1, "JBD2: Skipping superblock update on recovered sb " "(start %ld, seq %u, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); journal->j_flags |= JBD2_FLUSHED; } else { /* Lock here to make assertions happy... */ mutex_lock_io(&journal->j_checkpoint_mutex); /* * Update log tail information. We use REQ_FUA since new * transaction will start reusing journal space and so we * must make sure information about current log tail is on * disk before that. */ jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, journal->j_tail, REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } return jbd2_journal_start_thread(journal); } /* * This function expects that the caller will have locked the journal * buffer head, and will return with it unlocked */ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) { struct buffer_head *bh = journal->j_sb_buffer; journal_superblock_t *sb = journal->j_superblock; int ret = 0; /* Buffer got discarded which means block device got invalidated */ if (!buffer_mapped(bh)) { unlock_buffer(bh); return -EIO; } /* * Always set high priority flags to exempt from block layer's * QOS policies, e.g. writeback throttle. */ write_flags |= JBD2_JOURNAL_REQ_FLAGS; if (!(journal->j_flags & JBD2_BARRIER)) write_flags &= ~(REQ_FUA | REQ_PREFLUSH); trace_jbd2_write_superblock(journal, write_flags); if (buffer_write_io_error(bh)) { /* * Oh, dear. A previous attempt to write the journal * superblock failed. This could happen because the * USB device was yanked out. Or it could happen to * be a transient write error and maybe the block will * be remapped. Nothing we can do but to retry the * write and hope for the best. */ printk(KERN_ERR "JBD2: previous I/O error detected " "for journal superblock update for %s.\n", journal->j_devname); clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); } if (jbd2_journal_has_csum_v2or3(journal)) sb->s_checksum = jbd2_superblock_csum(journal, sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; submit_bh(REQ_OP_WRITE | write_flags, bh); wait_on_buffer(bh); if (buffer_write_io_error(bh)) { clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); ret = -EIO; } if (ret) { printk(KERN_ERR "JBD2: I/O error when updating journal superblock for %s.\n", journal->j_devname); if (!is_journal_aborted(journal)) jbd2_journal_abort(journal, ret); } return ret; } /** * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk. * @journal: The journal to update. * @tail_tid: TID of the new transaction at the tail of the log * @tail_block: The first block of the transaction at the tail of the log * @write_flags: Flags for the journal sb write operation * * Update a journal's superblock information about log tail and write it to * disk, waiting for the IO to complete. 
*/ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, unsigned long tail_block, blk_opf_t write_flags) { journal_superblock_t *sb = journal->j_superblock; int ret; if (is_journal_aborted(journal)) return -EIO; if (jbd2_check_fs_dev_write_error(journal)) { jbd2_journal_abort(journal, -EIO); return -EIO; } BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); jbd2_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", tail_block, tail_tid); lock_buffer(journal->j_sb_buffer); sb->s_sequence = cpu_to_be32(tail_tid); sb->s_start = cpu_to_be32(tail_block); ret = jbd2_write_superblock(journal, write_flags); if (ret) goto out; /* Log is no longer empty */ write_lock(&journal->j_state_lock); WARN_ON(!sb->s_sequence); journal->j_flags &= ~JBD2_FLUSHED; write_unlock(&journal->j_state_lock); out: return ret; } /** * jbd2_mark_journal_empty() - Mark on disk journal as empty. * @journal: The journal to update. * @write_flags: Flags for the journal sb write operation * * Update a journal's dynamic superblock fields to show that journal is empty. * Write updated superblock to disk waiting for IO to complete. */ static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags) { journal_superblock_t *sb = journal->j_superblock; bool had_fast_commit = false; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); lock_buffer(journal->j_sb_buffer); if (sb->s_start == 0) { /* Is it already empty? */ unlock_buffer(journal->j_sb_buffer); return; } jbd2_debug(1, "JBD2: Marking journal as empty (seq %u)\n", journal->j_tail_sequence); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); sb->s_start = cpu_to_be32(0); sb->s_head = cpu_to_be32(journal->j_head); if (jbd2_has_feature_fast_commit(journal)) { /* * When journal is clean, no need to commit fast commit flag and * make file system incompatible with older kernels. */ jbd2_clear_feature_fast_commit(journal); had_fast_commit = true; } jbd2_write_superblock(journal, write_flags); if (had_fast_commit) jbd2_set_feature_fast_commit(journal); /* Log is no longer empty */ write_lock(&journal->j_state_lock); journal->j_flags |= JBD2_FLUSHED; write_unlock(&journal->j_state_lock); } /** * __jbd2_journal_erase() - Discard or zeroout journal blocks (excluding superblock) * @journal: The journal to erase. * @flags: A discard/zeroout request is sent for each physically contigous * region of the journal. Either JBD2_JOURNAL_FLUSH_DISCARD or * JBD2_JOURNAL_FLUSH_ZEROOUT must be set to determine which operation * to perform. * * Note: JBD2_JOURNAL_FLUSH_ZEROOUT attempts to use hardware offload. Zeroes * will be explicitly written if no hardware offload is available, see * blkdev_issue_zeroout for more details. 
*/ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) { int err = 0; unsigned long block, log_offset; /* logical */ unsigned long long phys_block, block_start, block_stop; /* physical */ loff_t byte_start, byte_stop, byte_count; /* flags must be set to either discard or zeroout */ if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags || ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) return -EINVAL; if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !bdev_max_discard_sectors(journal->j_dev)) return -EOPNOTSUPP; /* * lookup block mapping and issue discard/zeroout for each * contiguous region */ log_offset = be32_to_cpu(journal->j_superblock->s_first); block_start = ~0ULL; for (block = log_offset; block < journal->j_total_len; block++) { err = jbd2_journal_bmap(journal, block, &phys_block); if (err) { pr_err("JBD2: bad block at offset %lu", block); return err; } if (block_start == ~0ULL) { block_start = phys_block; block_stop = block_start - 1; } /* * last block not contiguous with current block, * process last contiguous region and return to this block on * next loop */ if (phys_block != block_stop + 1) { block--; } else { block_stop++; /* * if this isn't the last block of journal, * no need to process now because next block may also * be part of this contiguous region */ if (block != journal->j_total_len - 1) continue; } /* * end of contiguous region or this is last block of journal, * take care of the region */ byte_start = block_start * journal->j_blocksize; byte_stop = block_stop * journal->j_blocksize; byte_count = (block_stop - block_start + 1) * journal->j_blocksize; truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping, byte_start, byte_stop); if (flags & JBD2_JOURNAL_FLUSH_DISCARD) { err = blkdev_issue_discard(journal->j_dev, byte_start >> SECTOR_SHIFT, byte_count >> SECTOR_SHIFT, GFP_NOFS); } else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) { err = blkdev_issue_zeroout(journal->j_dev, byte_start >> SECTOR_SHIFT, byte_count >> SECTOR_SHIFT, GFP_NOFS, 0); } if (unlikely(err != 0)) { pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu", err, block_start, block_stop); return err; } /* reset start and stop after processing a region */ block_start = ~0ULL; } return blkdev_issue_flush(journal->j_dev); } /** * jbd2_journal_update_sb_errno() - Update error in the journal. * @journal: The journal to update. * * Update a journal's errno. Write updated superblock to disk waiting for IO * to complete. */ void jbd2_journal_update_sb_errno(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; int errcode; lock_buffer(journal->j_sb_buffer); errcode = journal->j_errno; if (errcode == -ESHUTDOWN) errcode = 0; jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode); sb->s_errno = cpu_to_be32(errcode); jbd2_write_superblock(journal, REQ_FUA); } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); /** * jbd2_journal_load() - Read journal from disk. * @journal: Journal to act on. * * Given a journal_t structure which tells us which disk blocks contain * a journal, read the journal from disk to initialise the in-memory * structures. */ int jbd2_journal_load(journal_t *journal) { int err; journal_superblock_t *sb = journal->j_superblock; /* * Create a slab for this blocksize */ err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize)); if (err) return err; /* Let the recovery code check whether it needs to recover any * data from the journal. 
*/ err = jbd2_journal_recover(journal); if (err) { pr_warn("JBD2: journal recovery failed\n"); return err; } if (journal->j_failed_commit) { printk(KERN_ERR "JBD2: journal transaction %u on %s " "is corrupt.\n", journal->j_failed_commit, journal->j_devname); return -EFSCORRUPTED; } /* * clear JBD2_ABORT flag initialized in journal_init_common * here to update log tail information with the newest seq. */ journal->j_flags &= ~JBD2_ABORT; /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory * and reset them on disk. */ err = journal_reset(journal); if (err) { pr_warn("JBD2: journal reset failed\n"); return err; } journal->j_flags |= JBD2_LOADED; return 0; } /** * jbd2_journal_destroy() - Release a journal_t structure. * @journal: Journal to act on. * * Release a journal_t structure once it is no longer in use by the * journaled object. * Return <0 if we couldn't clean up the journal. */ int jbd2_journal_destroy(journal_t *journal) { int err = 0; /* Wait for the commit thread to wake up and die. */ journal_kill_thread(journal); /* Force a final log commit */ if (journal->j_running_transaction) jbd2_journal_commit_transaction(journal); /* Force any old transactions to disk */ /* Totally anal locking here... */ spin_lock(&journal->j_list_lock); while (journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); mutex_lock_io(&journal->j_checkpoint_mutex); err = jbd2_log_do_checkpoint(journal); mutex_unlock(&journal->j_checkpoint_mutex); /* * If checkpointing failed, just free the buffers to avoid * looping forever */ if (err) { jbd2_journal_destroy_checkpoint(journal); spin_lock(&journal->j_list_lock); break; } spin_lock(&journal->j_list_lock); } J_ASSERT(journal->j_running_transaction == NULL); J_ASSERT(journal->j_committing_transaction == NULL); J_ASSERT(journal->j_checkpoint_transactions == NULL); spin_unlock(&journal->j_list_lock); /* * OK, all checkpoint transactions have been checked, now check the * writeback errseq of fs dev and abort the journal if some buffer * failed to write back to the original location, otherwise the * filesystem may become inconsistent. */ if (!is_journal_aborted(journal) && jbd2_check_fs_dev_write_error(journal)) jbd2_journal_abort(journal, -EIO); if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { mutex_lock_io(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); journal->j_tail_sequence = ++journal->j_transaction_sequence; write_unlock(&journal->j_state_lock); jbd2_mark_journal_empty(journal, REQ_PREFLUSH | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } else err = -EIO; brelse(journal->j_sb_buffer); } if (journal->j_shrinker) { percpu_counter_destroy(&journal->j_checkpoint_jh_count); shrinker_free(journal->j_shrinker); } if (journal->j_proc_entry) jbd2_stats_proc_exit(journal); iput(journal->j_inode); if (journal->j_revoke) jbd2_journal_destroy_revoke(journal); if (journal->j_chksum_driver) crypto_free_shash(journal->j_chksum_driver); kfree(journal->j_fc_wbuf); kfree(journal->j_wbuf); kfree(journal); return err; } /** * jbd2_journal_check_used_features() - Check if features specified are used. * @journal: Journal to check. * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount * @incompat: bitmask of incompatible features * * Check whether the journal uses all of a given set of * features. Return true (non-zero) if it does. 
**/ int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { journal_superblock_t *sb; if (!compat && !ro && !incompat) return 1; if (!jbd2_format_support_feature(journal)) return 0; sb = journal->j_superblock; if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) return 1; return 0; } /** * jbd2_journal_check_available_features() - Check feature set in journalling layer * @journal: Journal to check. * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount * @incompat: bitmask of incompatible features * * Check whether the journaling code supports the use of * all of a given set of features on this journal. Return true * (non-zero) if it can. */ int jbd2_journal_check_available_features(journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { if (!compat && !ro && !incompat) return 1; if (!jbd2_format_support_feature(journal)) return 0; if ((compat & JBD2_KNOWN_COMPAT_FEATURES) == compat && (ro & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro && (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat) return 1; return 0; } static int jbd2_journal_initialize_fast_commit(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; unsigned long long num_fc_blks; num_fc_blks = jbd2_journal_get_num_fc_blks(sb); if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS) return -ENOSPC; /* Are we called twice? */ WARN_ON(journal->j_fc_wbuf != NULL); journal->j_fc_wbuf = kmalloc_array(num_fc_blks, sizeof(struct buffer_head *), GFP_KERNEL); if (!journal->j_fc_wbuf) return -ENOMEM; journal->j_fc_wbufsize = num_fc_blks; journal->j_fc_last = journal->j_last; journal->j_last = journal->j_fc_last - num_fc_blks; journal->j_fc_first = journal->j_last + 1; journal->j_fc_off = 0; journal->j_free = journal->j_last - journal->j_first; journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal); return 0; } /** * jbd2_journal_set_features() - Mark a given journal feature in the superblock * @journal: Journal to act on. * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount * @incompat: bitmask of incompatible features * * Mark a given journal feature as present on the * superblock. Returns true if the requested features could be set. * */ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { #define INCOMPAT_FEATURE_ON(f) \ ((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f))) #define COMPAT_FEATURE_ON(f) \ ((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f))) journal_superblock_t *sb; if (jbd2_journal_check_used_features(journal, compat, ro, incompat)) return 1; if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) return 0; /* If enabling v2 checksums, turn on v3 instead */ if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) { incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2; incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3; } /* Asking for checksumming v3 and v1? Only give them v3. 
*/ if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 && compat & JBD2_FEATURE_COMPAT_CHECKSUM) compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM; jbd2_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", compat, ro, incompat); sb = journal->j_superblock; if (incompat & JBD2_FEATURE_INCOMPAT_FAST_COMMIT) { if (jbd2_journal_initialize_fast_commit(journal)) { pr_err("JBD2: Cannot enable fast commits.\n"); return 0; } } /* Load the checksum driver if necessary */ if ((journal->j_chksum_driver == NULL) && INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(journal->j_chksum_driver)) { printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); journal->j_chksum_driver = NULL; return 0; } /* Precompute checksum seed for all metadata */ journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, sizeof(sb->s_uuid)); } lock_buffer(journal->j_sb_buffer); /* If enabling v3 checksums, update superblock */ if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { sb->s_checksum_type = JBD2_CRC32C_CHKSUM; sb->s_feature_compat &= ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); } /* If enabling v1 checksums, downgrade superblock */ if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM)) sb->s_feature_incompat &= ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 | JBD2_FEATURE_INCOMPAT_CSUM_V3); sb->s_feature_compat |= cpu_to_be32(compat); sb->s_feature_ro_compat |= cpu_to_be32(ro); sb->s_feature_incompat |= cpu_to_be32(incompat); unlock_buffer(journal->j_sb_buffer); journal->j_revoke_records_per_block = journal_revoke_records_per_block(journal); return 1; #undef COMPAT_FEATURE_ON #undef INCOMPAT_FEATURE_ON } /* * jbd2_journal_clear_features() - Clear a given journal feature in the * superblock * @journal: Journal to act on. * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount * @incompat: bitmask of incompatible features * * Clear a given journal feature as present on the * superblock. */ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { journal_superblock_t *sb; jbd2_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", compat, ro, incompat); sb = journal->j_superblock; sb->s_feature_compat &= ~cpu_to_be32(compat); sb->s_feature_ro_compat &= ~cpu_to_be32(ro); sb->s_feature_incompat &= ~cpu_to_be32(incompat); journal->j_revoke_records_per_block = journal_revoke_records_per_block(journal); } EXPORT_SYMBOL(jbd2_journal_clear_features); /** * jbd2_journal_flush() - Flush journal * @journal: Journal to act on. * @flags: optional operation on the journal blocks after the flush (see below) * * Flush all data for a given journal to disk and empty the journal. * Filesystems can use this when remounting readonly to ensure that * recovery does not need to happen on remount. Optionally, a discard or zeroout * can be issued on the journal blocks after flushing. * * flags: * JBD2_JOURNAL_FLUSH_DISCARD: issues discards for the journal blocks * JBD2_JOURNAL_FLUSH_ZEROOUT: issues zeroouts for the journal blocks */ int jbd2_journal_flush(journal_t *journal, unsigned int flags) { int err = 0; transaction_t *transaction = NULL; write_lock(&journal->j_state_lock); /* Force everything buffered to the log... */ if (journal->j_running_transaction) { transaction = journal->j_running_transaction; __jbd2_log_start_commit(journal, transaction->t_tid); } else if (journal->j_committing_transaction) transaction = journal->j_committing_transaction; /* Wait for the log commit to complete... 
*/ if (transaction) { tid_t tid = transaction->t_tid; write_unlock(&journal->j_state_lock); jbd2_log_wait_commit(journal, tid); } else { write_unlock(&journal->j_state_lock); } /* ...and flush everything in the log out to disk. */ spin_lock(&journal->j_list_lock); while (!err && journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); mutex_lock_io(&journal->j_checkpoint_mutex); err = jbd2_log_do_checkpoint(journal); mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); if (is_journal_aborted(journal)) return -EIO; mutex_lock_io(&journal->j_checkpoint_mutex); if (!err) { err = jbd2_cleanup_journal_tail(journal); if (err < 0) { mutex_unlock(&journal->j_checkpoint_mutex); goto out; } err = 0; } /* Finally, mark the journal as really needing no recovery. * This sets s_start==0 in the underlying superblock, which is * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ jbd2_mark_journal_empty(journal, REQ_FUA); if (flags) err = __jbd2_journal_erase(journal, flags); mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); J_ASSERT(!journal->j_committing_transaction); J_ASSERT(!journal->j_checkpoint_transactions); J_ASSERT(journal->j_head == journal->j_tail); J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); write_unlock(&journal->j_state_lock); out: return err; } /** * jbd2_journal_wipe() - Wipe journal contents * @journal: Journal to act on. * @write: flag (see below) * * Wipe out all of the contents of a journal, safely. This will produce * a warning if the journal contains any valid recovery information. * Must be called between journal_init_*() and jbd2_journal_load(). * * If 'write' is non-zero, then we wipe out the journal on disk; otherwise * we merely suppress recovery. */ int jbd2_journal_wipe(journal_t *journal, int write) { int err; J_ASSERT (!(journal->j_flags & JBD2_LOADED)); if (!journal->j_tail) return 0; printk(KERN_WARNING "JBD2: %s recovery information on journal\n", write ? "Clearing" : "Ignoring"); err = jbd2_journal_skip_recovery(journal); if (write) { /* Lock to make assertions happy... */ mutex_lock_io(&journal->j_checkpoint_mutex); jbd2_mark_journal_empty(journal, REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } return err; } /** * jbd2_journal_abort () - Shutdown the journal immediately. * @journal: the journal to shutdown. * @errno: an error number to record in the journal indicating * the reason for the shutdown. * * Perform a complete, immediate shutdown of the ENTIRE * journal (not of a single transaction). This operation cannot be * undone without closing and reopening the journal. * * The jbd2_journal_abort function is intended to support higher level error * recovery mechanisms such as the ext2/ext3 remount-readonly error * mode. * * Journal abort has very specific semantics. Any existing dirty, * unjournaled buffers in the main filesystem will still be written to * disk by bdflush, but the journaling mechanism will be suspended * immediately and no further transaction commits will be honoured. * * Any dirty, journaled buffers will be written back to disk without * hitting the journal. Atomicity cannot be guaranteed on an aborted * filesystem, but we _do_ attempt to leave as much data as possible * behind for fsck to use for cleanup. 
* * Any attempt to get a new transaction handle on a journal which is in * ABORT state will just result in an -EROFS error return. A * jbd2_journal_stop on an existing handle will return -EIO if we have * entered abort state during the update. * * Recursive transactions are not disturbed by journal abort until the * final jbd2_journal_stop, which will receive the -EIO error. * * Finally, the jbd2_journal_abort call allows the caller to supply an errno * which will be recorded (if possible) in the journal superblock. This * allows a client to record failure conditions in the middle of a * transaction without having to complete the transaction to record the * failure to disk. ext3_error, for example, now uses this * functionality. * */ void jbd2_journal_abort(journal_t *journal, int errno) { transaction_t *transaction; /* * Lock the aborting procedure until everything is done, this avoid * races between filesystem's error handling flow (e.g. ext4_abort()), * ensure panic after the error info is written into journal's * superblock. */ mutex_lock(&journal->j_abort_mutex); /* * ESHUTDOWN always takes precedence because a file system check * caused by any other journal abort error is not required after * a shutdown triggered. */ write_lock(&journal->j_state_lock); if (journal->j_flags & JBD2_ABORT) { int old_errno = journal->j_errno; write_unlock(&journal->j_state_lock); if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) { journal->j_errno = errno; jbd2_journal_update_sb_errno(journal); } mutex_unlock(&journal->j_abort_mutex); return; } /* * Mark the abort as occurred and start current running transaction * to release all journaled buffer. */ pr_err("Aborting journal on device %s.\n", journal->j_devname); journal->j_flags |= JBD2_ABORT; journal->j_errno = errno; transaction = journal->j_running_transaction; if (transaction) __jbd2_log_start_commit(journal, transaction->t_tid); write_unlock(&journal->j_state_lock); /* * Record errno to the journal super block, so that fsck and jbd2 * layer could realise that a filesystem check is needed. */ jbd2_journal_update_sb_errno(journal); mutex_unlock(&journal->j_abort_mutex); } /** * jbd2_journal_errno() - returns the journal's error state. * @journal: journal to examine. * * This is the errno number set with jbd2_journal_abort(), the last * time the journal was mounted - if the journal was stopped * without calling abort this will be 0. * * If the journal has been aborted on this mount time -EROFS will * be returned. */ int jbd2_journal_errno(journal_t *journal) { int err; read_lock(&journal->j_state_lock); if (journal->j_flags & JBD2_ABORT) err = -EROFS; else err = journal->j_errno; read_unlock(&journal->j_state_lock); return err; } /** * jbd2_journal_clear_err() - clears the journal's error state * @journal: journal to act on. * * An error must be cleared or acked to take a FS out of readonly * mode. */ int jbd2_journal_clear_err(journal_t *journal) { int err = 0; write_lock(&journal->j_state_lock); if (journal->j_flags & JBD2_ABORT) err = -EROFS; else journal->j_errno = 0; write_unlock(&journal->j_state_lock); return err; } /** * jbd2_journal_ack_err() - Ack journal err. * @journal: journal to act on. * * An error must be cleared or acked to take a FS out of readonly * mode. 
*/ void jbd2_journal_ack_err(journal_t *journal) { write_lock(&journal->j_state_lock); if (journal->j_errno) journal->j_flags |= JBD2_ACK_ERR; write_unlock(&journal->j_state_lock); } int jbd2_journal_blocks_per_page(struct inode *inode) { return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); } /* * helper functions to deal with 32 or 64bit block numbers. */ size_t journal_tag_bytes(journal_t *journal) { size_t sz; if (jbd2_has_feature_csum3(journal)) return sizeof(journal_block_tag3_t); sz = sizeof(journal_block_tag_t); if (jbd2_has_feature_csum2(journal)) sz += sizeof(__u16); if (jbd2_has_feature_64bit(journal)) return sz; else return sz - sizeof(__u32); } /* * JBD memory management * * These functions are used to allocate block-sized chunks of memory * used for making copies of buffer_head data. Very often it will be * page-sized chunks of data, but sometimes it will be in * sub-page-size chunks. (For example, 16k pages on Power systems * with a 4k block file system.) For blocks smaller than a page, we * use a SLAB allocator. There are slab caches for each block size, * which are allocated at mount time, if necessary, and we only free * (all of) the slab caches when/if the jbd2 module is unloaded. For * this reason we don't need to a mutex to protect access to * jbd2_slab[] allocating or releasing memory; only in * jbd2_journal_create_slab(). */ #define JBD2_MAX_SLABS 8 static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k" }; static void jbd2_journal_destroy_slabs(void) { int i; for (i = 0; i < JBD2_MAX_SLABS; i++) { kmem_cache_destroy(jbd2_slab[i]); jbd2_slab[i] = NULL; } } static int jbd2_journal_create_slab(size_t size) { static DEFINE_MUTEX(jbd2_slab_create_mutex); int i = order_base_2(size) - 10; size_t slab_size; if (size == PAGE_SIZE) return 0; if (i >= JBD2_MAX_SLABS) return -EINVAL; if (unlikely(i < 0)) i = 0; mutex_lock(&jbd2_slab_create_mutex); if (jbd2_slab[i]) { mutex_unlock(&jbd2_slab_create_mutex); return 0; /* Already created */ } slab_size = 1 << (i+10); jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, slab_size, 0, NULL); mutex_unlock(&jbd2_slab_create_mutex); if (!jbd2_slab[i]) { printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); return -ENOMEM; } return 0; } static struct kmem_cache *get_slab(size_t size) { int i = order_base_2(size) - 10; BUG_ON(i >= JBD2_MAX_SLABS); if (unlikely(i < 0)) i = 0; BUG_ON(jbd2_slab[i] == NULL); return jbd2_slab[i]; } void *jbd2_alloc(size_t size, gfp_t flags) { void *ptr; BUG_ON(size & (size-1)); /* Must be a power of 2 */ if (size < PAGE_SIZE) ptr = kmem_cache_alloc(get_slab(size), flags); else ptr = (void *)__get_free_pages(flags, get_order(size)); /* Check alignment; SLUB has gotten this wrong in the past, * and this can lead to user data corruption! 
*/ BUG_ON(((unsigned long) ptr) & (size-1)); return ptr; } void jbd2_free(void *ptr, size_t size) { if (size < PAGE_SIZE) kmem_cache_free(get_slab(size), ptr); else free_pages((unsigned long)ptr, get_order(size)); }; /* * Journal_head storage management */ static struct kmem_cache *jbd2_journal_head_cache; #ifdef CONFIG_JBD2_DEBUG static atomic_t nr_journal_heads = ATOMIC_INIT(0); #endif static int __init jbd2_journal_init_journal_head_cache(void) { J_ASSERT(!jbd2_journal_head_cache); jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", sizeof(struct journal_head), 0, /* offset */ SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU, NULL); /* ctor */ if (!jbd2_journal_head_cache) { printk(KERN_EMERG "JBD2: no memory for journal_head cache\n"); return -ENOMEM; } return 0; } static void jbd2_journal_destroy_journal_head_cache(void) { kmem_cache_destroy(jbd2_journal_head_cache); jbd2_journal_head_cache = NULL; } /* * journal_head splicing and dicing */ static struct journal_head *journal_alloc_journal_head(void) { struct journal_head *ret; #ifdef CONFIG_JBD2_DEBUG atomic_inc(&nr_journal_heads); #endif ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); if (!ret) { jbd2_debug(1, "out of memory for journal_head\n"); pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS | __GFP_NOFAIL); } if (ret) spin_lock_init(&ret->b_state_lock); return ret; } static void journal_free_journal_head(struct journal_head *jh) { #ifdef CONFIG_JBD2_DEBUG atomic_dec(&nr_journal_heads); memset(jh, JBD2_POISON_FREE, sizeof(*jh)); #endif kmem_cache_free(jbd2_journal_head_cache, jh); } /* * A journal_head is attached to a buffer_head whenever JBD has an * interest in the buffer. * * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit * is set. This bit is tested in core kernel code where we need to take * JBD-specific actions. Testing the zeroness of ->b_private is not reliable * there. * * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. * * When a buffer has its BH_JBD bit set it is immune from being released by * core kernel code, mainly via ->b_count. * * A journal_head is detached from its buffer_head when the journal_head's * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint * transaction (b_cp_transaction) hold their references to b_jcount. * * Various places in the kernel want to attach a journal_head to a buffer_head * _before_ attaching the journal_head to a transaction. To protect the * journal_head in this situation, jbd2_journal_add_journal_head elevates the * journal_head's b_jcount refcount by one. The caller must call * jbd2_journal_put_journal_head() to undo this. * * So the typical usage would be: * * (Attach a journal_head if needed. Increments b_jcount) * struct journal_head *jh = jbd2_journal_add_journal_head(bh); * ... * (Get another reference for transaction) * jbd2_journal_grab_journal_head(bh); * jh->b_transaction = xxx; * (Put original reference) * jbd2_journal_put_journal_head(jh); */ /* * Give a buffer_head a journal_head. * * May sleep. 
*/ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh) { struct journal_head *jh; struct journal_head *new_jh = NULL; repeat: if (!buffer_jbd(bh)) new_jh = journal_alloc_journal_head(); jbd_lock_bh_journal_head(bh); if (buffer_jbd(bh)) { jh = bh2jh(bh); } else { J_ASSERT_BH(bh, (atomic_read(&bh->b_count) > 0) || (bh->b_folio && bh->b_folio->mapping)); if (!new_jh) { jbd_unlock_bh_journal_head(bh); goto repeat; } jh = new_jh; new_jh = NULL; /* We consumed it */ set_buffer_jbd(bh); bh->b_private = jh; jh->b_bh = bh; get_bh(bh); BUFFER_TRACE(bh, "added journal_head"); } jh->b_jcount++; jbd_unlock_bh_journal_head(bh); if (new_jh) journal_free_journal_head(new_jh); return bh->b_private; } /* * Grab a ref against this buffer_head's journal_head. If it ended up not * having a journal_head, return NULL */ struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh) { struct journal_head *jh = NULL; jbd_lock_bh_journal_head(bh); if (buffer_jbd(bh)) { jh = bh2jh(bh); jh->b_jcount++; } jbd_unlock_bh_journal_head(bh); return jh; } EXPORT_SYMBOL(jbd2_journal_grab_journal_head); static void __journal_remove_journal_head(struct buffer_head *bh) { struct journal_head *jh = bh2jh(bh); J_ASSERT_JH(jh, jh->b_transaction == NULL); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); J_ASSERT_JH(jh, jh->b_jlist == BJ_None); J_ASSERT_BH(bh, buffer_jbd(bh)); J_ASSERT_BH(bh, jh2bh(jh) == bh); BUFFER_TRACE(bh, "remove journal_head"); /* Unlink before dropping the lock */ bh->b_private = NULL; jh->b_bh = NULL; /* debug, really */ clear_buffer_jbd(bh); } static void journal_release_journal_head(struct journal_head *jh, size_t b_size) { if (jh->b_frozen_data) { printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); jbd2_free(jh->b_frozen_data, b_size); } if (jh->b_committed_data) { printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); jbd2_free(jh->b_committed_data, b_size); } journal_free_journal_head(jh); } /* * Drop a reference on the passed journal_head. If it fell to zero then * release the journal_head from the buffer_head. */ void jbd2_journal_put_journal_head(struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); jbd_lock_bh_journal_head(bh); J_ASSERT_JH(jh, jh->b_jcount > 0); --jh->b_jcount; if (!jh->b_jcount) { __journal_remove_journal_head(bh); jbd_unlock_bh_journal_head(bh); journal_release_journal_head(jh, bh->b_size); __brelse(bh); } else { jbd_unlock_bh_journal_head(bh); } } EXPORT_SYMBOL(jbd2_journal_put_journal_head); /* * Initialize jbd inode head */ void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) { jinode->i_transaction = NULL; jinode->i_next_transaction = NULL; jinode->i_vfs_inode = inode; jinode->i_flags = 0; jinode->i_dirty_start = 0; jinode->i_dirty_end = 0; INIT_LIST_HEAD(&jinode->i_list); } /* * Function to be called before we start removing inode from memory (i.e., * clear_inode() is a fine place to be called from). It removes inode from * transaction's lists. 
*/ void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode) { if (!journal) return; restart: spin_lock(&journal->j_list_lock); /* Is commit writing out inode - we have to wait */ if (jinode->i_flags & JI_COMMIT_RUNNING) { wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&journal->j_list_lock); schedule(); finish_wait(wq, &wait.wq_entry); goto restart; } if (jinode->i_transaction) { list_del(&jinode->i_list); jinode->i_transaction = NULL; } spin_unlock(&journal->j_list_lock); } #ifdef CONFIG_PROC_FS #define JBD2_STATS_PROC_NAME "fs/jbd2" static void __init jbd2_create_jbd_stats_proc_entry(void) { proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL); } static void __exit jbd2_remove_jbd_stats_proc_entry(void) { if (proc_jbd2_stats) remove_proc_entry(JBD2_STATS_PROC_NAME, NULL); } #else #define jbd2_create_jbd_stats_proc_entry() do {} while (0) #define jbd2_remove_jbd_stats_proc_entry() do {} while (0) #endif struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; static int __init jbd2_journal_init_inode_cache(void) { J_ASSERT(!jbd2_inode_cache); jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0); if (!jbd2_inode_cache) { pr_emerg("JBD2: failed to create inode cache\n"); return -ENOMEM; } return 0; } static int __init jbd2_journal_init_handle_cache(void) { J_ASSERT(!jbd2_handle_cache); jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); if (!jbd2_handle_cache) { printk(KERN_EMERG "JBD2: failed to create handle cache\n"); return -ENOMEM; } return 0; } static void jbd2_journal_destroy_inode_cache(void) { kmem_cache_destroy(jbd2_inode_cache); jbd2_inode_cache = NULL; } static void jbd2_journal_destroy_handle_cache(void) { kmem_cache_destroy(jbd2_handle_cache); jbd2_handle_cache = NULL; } /* * Module startup and shutdown */ static int __init journal_init_caches(void) { int ret; ret = jbd2_journal_init_revoke_record_cache(); if (ret == 0) ret = jbd2_journal_init_revoke_table_cache(); if (ret == 0) ret = jbd2_journal_init_journal_head_cache(); if (ret == 0) ret = jbd2_journal_init_handle_cache(); if (ret == 0) ret = jbd2_journal_init_inode_cache(); if (ret == 0) ret = jbd2_journal_init_transaction_cache(); return ret; } static void jbd2_journal_destroy_caches(void) { jbd2_journal_destroy_revoke_record_cache(); jbd2_journal_destroy_revoke_table_cache(); jbd2_journal_destroy_journal_head_cache(); jbd2_journal_destroy_handle_cache(); jbd2_journal_destroy_inode_cache(); jbd2_journal_destroy_transaction_cache(); jbd2_journal_destroy_slabs(); } static int __init journal_init(void) { int ret; BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); ret = journal_init_caches(); if (ret == 0) { jbd2_create_jbd_stats_proc_entry(); } else { jbd2_journal_destroy_caches(); } return ret; } static void __exit journal_exit(void) { #ifdef CONFIG_JBD2_DEBUG int n = atomic_read(&nr_journal_heads); if (n) printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n); #endif jbd2_remove_jbd_stats_proc_entry(); jbd2_journal_destroy_caches(); } MODULE_LICENSE("GPL"); module_init(journal_init); module_exit(journal_exit);
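/*
 * Illustrative userspace sketch (not part of the original sources): it
 * mirrors the per-tag size arithmetic of journal_tag_bytes() and the
 * descriptor-block sizing in journal_init_common() above
 * (n = j_blocksize / minimum tag size). The numeric base sizes are
 * simplified stand-ins for sizeof(journal_block_tag_t) and
 * sizeof(journal_block_tag3_t), not the real on-disk layouts.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static size_t tag_bytes(bool csum3, bool csum2, bool sixty_four_bit)
{
	size_t sz;

	if (csum3)
		return 16;		/* stand-in: the v3 tag has one fixed size */
	sz = 12;			/* stand-in: base tag size */
	if (csum2)
		sz += sizeof(uint16_t);	/* csum v2 adds a 16-bit per-tag checksum */
	if (sixty_four_bit)
		return sz;
	return sz - sizeof(uint32_t);	/* 32-bit journals drop the high block bits */
}

int main(void)
{
	size_t blocksize = 4096;
	size_t min_tag = tag_bytes(false, false, false);	/* smallest tag wins */

	printf("min tag %zu bytes -> %zu tags per %zu-byte descriptor block\n",
	       min_tag, blocksize / min_tag, blocksize);
	return 0;
}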
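/*
 * Illustrative userspace sketch (not part of the original sources): the
 * same run-coalescing idea used by __jbd2_journal_erase() above -- walk a
 * logical-to-physical block mapping and emit one range operation per
 * physically contiguous run instead of one per block. The phys[] table and
 * emit_range() are made-up stand-ins for jbd2_journal_bmap() and
 * blkdev_issue_discard()/blkdev_issue_zeroout().
 */
#include <stdio.h>

static void emit_range(unsigned long long start, unsigned long long end)
{
	printf("discard physical blocks %llu-%llu\n", start, end);
}

int main(void)
{
	/* hypothetical physical block for each logical journal block */
	unsigned long long phys[] = { 100, 101, 102, 200, 201, 300 };
	unsigned long n = sizeof(phys) / sizeof(phys[0]);
	unsigned long long start = phys[0], stop = phys[0];
	unsigned long i;

	for (i = 1; i < n; i++) {
		if (phys[i] == stop + 1) {
			stop++;			/* still contiguous: extend the run */
			continue;
		}
		emit_range(start, stop);	/* run ended: flush it */
		start = stop = phys[i];
	}
	emit_range(start, stop);		/* final run */
	return 0;
}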
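/*
 * Illustrative userspace sketch (not part of the original sources): the
 * bucket arithmetic used by jbd2_journal_create_slab()/get_slab() above.
 * Block sizes are powers of two starting at 1k, so order_base_2(size) - 10
 * maps 1k->0, 2k->1, 4k->2, ... and 1 << (i + 10) recovers the bucket's
 * size. order2() is a stand-in for the kernel's order_base_2().
 */
#include <stdio.h>

static int order2(unsigned long n)	/* ceil(log2(n)) for n >= 1 */
{
	int order = 0;

	while ((1UL << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned long sizes[] = { 1024, 2048, 4096, 16384 };
	unsigned long i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		int idx = order2(sizes[i]) - 10;

		printf("size %5lu -> slab index %d (bucket %lu bytes)\n",
		       sizes[i], idx, 1UL << (idx + 10));
	}
	return 0;
}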
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <net/net_namespace.h>
#include <linux/rtnetlink.h>
#include <linux/if_arp.h>
#include <linux/list.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/dma-mapping.h>

#include <net/addrconf.h>

#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/rdma_netlink.h>
#include <linux/kthread.h>

#include "siw.h"
#include "siw_verbs.h"

MODULE_AUTHOR("Bernard Metzler");
MODULE_DESCRIPTION("Software iWARP Driver");
MODULE_LICENSE("Dual BSD/GPL");

/* transmit from user buffer, if possible */
const bool zcopy_tx = true;

/* Restrict usage of GSO, if hardware peer iwarp is unable to process
 * large packets. try_gso = true lets siw try to use local GSO,
 * if peer agrees. Not using GSO severely limits siw maximum tx bandwidth.
*/ const bool try_gso; /* Attach siw also with loopback devices */ const bool loopback_enabled = true; /* We try to negotiate CRC on, if true */ const bool mpa_crc_required; /* MPA CRC on/off enforced */ const bool mpa_crc_strict; /* Control TCP_NODELAY socket option */ const bool siw_tcp_nagle; /* Select MPA version to be used during connection setup */ u_char mpa_version = MPA_REVISION_2; /* Selects MPA P2P mode (additional handshake during connection * setup, if true. */ const bool peer_to_peer; struct task_struct *siw_tx_thread[NR_CPUS]; struct crypto_shash *siw_crypto_shash; static int siw_device_register(struct siw_device *sdev, const char *name) { struct ib_device *base_dev = &sdev->base_dev; static int dev_id = 1; int rv; sdev->vendor_part_id = dev_id++; rv = ib_register_device(base_dev, name, NULL); if (rv) { pr_warn("siw: device registration error %d\n", rv); return rv; } siw_dbg(base_dev, "HWaddr=%pM\n", sdev->raw_gid); return 0; } static void siw_device_cleanup(struct ib_device *base_dev) { struct siw_device *sdev = to_siw_dev(base_dev); xa_destroy(&sdev->qp_xa); xa_destroy(&sdev->mem_xa); } static int siw_dev_qualified(struct net_device *netdev) { /* * Additional hardware support can be added here * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see * <linux/if_arp.h> for type identifiers. */ if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 || netdev->type == ARPHRD_NONE || (netdev->type == ARPHRD_LOOPBACK && loopback_enabled)) return 1; return 0; } static DEFINE_PER_CPU(atomic_t, siw_use_cnt); static struct { struct cpumask **tx_valid_cpus; int num_nodes; } siw_cpu_info; static void siw_destroy_cpulist(int number) { int i = 0; while (i < number) kfree(siw_cpu_info.tx_valid_cpus[i++]); kfree(siw_cpu_info.tx_valid_cpus); siw_cpu_info.tx_valid_cpus = NULL; } static int siw_init_cpulist(void) { int i, num_nodes = nr_node_ids; memset(siw_tx_thread, 0, sizeof(siw_tx_thread)); siw_cpu_info.num_nodes = num_nodes; siw_cpu_info.tx_valid_cpus = kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus) { siw_cpu_info.num_nodes = 0; return -ENOMEM; } for (i = 0; i < siw_cpu_info.num_nodes; i++) { siw_cpu_info.tx_valid_cpus[i] = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus[i]) goto out_err; cpumask_clear(siw_cpu_info.tx_valid_cpus[i]); } for_each_possible_cpu(i) cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]); return 0; out_err: siw_cpu_info.num_nodes = 0; siw_destroy_cpulist(i); return -ENOMEM; } /* * Choose CPU with least number of active QP's from NUMA node of * TX interface. 
*/ int siw_get_tx_cpu(struct siw_device *sdev) { const struct cpumask *tx_cpumask; int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1; if (node < 0) tx_cpumask = cpu_online_mask; else tx_cpumask = siw_cpu_info.tx_valid_cpus[node]; num_cpus = cpumask_weight(tx_cpumask); if (!num_cpus) { /* no CPU on this NUMA node */ tx_cpumask = cpu_online_mask; num_cpus = cpumask_weight(tx_cpumask); } if (!num_cpus) goto out; cpu = cpumask_first(tx_cpumask); for (i = 0, min_use = SIW_MAX_QP; i < num_cpus; i++, cpu = cpumask_next(cpu, tx_cpumask)) { int usage; /* Skip any cores which have no TX thread */ if (!siw_tx_thread[cpu]) continue; usage = atomic_read(&per_cpu(siw_use_cnt, cpu)); if (usage <= min_use) { tx_cpu = cpu; min_use = usage; } } siw_dbg(&sdev->base_dev, "tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use); out: if (tx_cpu >= 0) atomic_inc(&per_cpu(siw_use_cnt, tx_cpu)); else pr_warn("siw: no tx cpu found\n"); return tx_cpu; } void siw_put_tx_cpu(int cpu) { atomic_dec(&per_cpu(siw_use_cnt, cpu)); } static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id) { struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id); if (qp) { /* * siw_qp_id2obj() increments object reference count */ siw_qp_put(qp); return &qp->base_qp; } return NULL; } static const struct ib_device_ops siw_device_ops = { .owner = THIS_MODULE, .uverbs_abi_ver = SIW_ABI_VERSION, .driver_id = RDMA_DRIVER_SIW, .alloc_mr = siw_alloc_mr, .alloc_pd = siw_alloc_pd, .alloc_ucontext = siw_alloc_ucontext, .create_cq = siw_create_cq, .create_qp = siw_create_qp, .create_srq = siw_create_srq, .dealloc_driver = siw_device_cleanup, .dealloc_pd = siw_dealloc_pd, .dealloc_ucontext = siw_dealloc_ucontext, .dereg_mr = siw_dereg_mr, .destroy_cq = siw_destroy_cq, .destroy_qp = siw_destroy_qp, .destroy_srq = siw_destroy_srq, .get_dma_mr = siw_get_dma_mr, .get_port_immutable = siw_get_port_immutable, .iw_accept = siw_accept, .iw_add_ref = siw_qp_get_ref, .iw_connect = siw_connect, .iw_create_listen = siw_create_listen, .iw_destroy_listen = siw_destroy_listen, .iw_get_qp = siw_get_base_qp, .iw_reject = siw_reject, .iw_rem_ref = siw_qp_put_ref, .map_mr_sg = siw_map_mr_sg, .mmap = siw_mmap, .mmap_free = siw_mmap_free, .modify_qp = siw_verbs_modify_qp, .modify_srq = siw_modify_srq, .poll_cq = siw_poll_cq, .post_recv = siw_post_receive, .post_send = siw_post_send, .post_srq_recv = siw_post_srq_recv, .query_device = siw_query_device, .query_gid = siw_query_gid, .query_port = siw_query_port, .query_qp = siw_query_qp, .query_srq = siw_query_srq, .req_notify_cq = siw_req_notify_cq, .reg_user_mr = siw_reg_user_mr, INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq), INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd), INIT_RDMA_OBJ_SIZE(ib_qp, siw_qp, base_qp), INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq), INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext), }; static struct siw_device *siw_device_create(struct net_device *netdev) { struct siw_device *sdev = NULL; struct ib_device *base_dev; int rv; sdev = ib_alloc_device(siw_device, base_dev); if (!sdev) return NULL; base_dev = &sdev->base_dev; sdev->netdev = netdev; if (netdev->addr_len) { memcpy(sdev->raw_gid, netdev->dev_addr, min_t(unsigned int, netdev->addr_len, ETH_ALEN)); } else { /* * This device does not have a HW address, but * connection mangagement requires a unique gid. 
*/ eth_random_addr(sdev->raw_gid); } addrconf_addr_eui48((u8 *)&base_dev->node_guid, sdev->raw_gid); base_dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND); base_dev->node_type = RDMA_NODE_RNIC; memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON, sizeof(SIW_NODE_DESC_COMMON)); /* * Current model (one-to-one device association): * One Softiwarp device per net_device or, equivalently, * per physical port. */ base_dev->phys_port_cnt = 1; base_dev->num_comp_vectors = num_possible_cpus(); xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1); xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1); ib_set_device_ops(base_dev, &siw_device_ops); rv = ib_device_set_netdev(base_dev, netdev, 1); if (rv) goto error; memcpy(base_dev->iw_ifname, netdev->name, sizeof(base_dev->iw_ifname)); /* Disable TCP port mapping */ base_dev->iw_driver_flags = IW_F_NO_PORT_MAP; sdev->attrs.max_qp = SIW_MAX_QP; sdev->attrs.max_qp_wr = SIW_MAX_QP_WR; sdev->attrs.max_ord = SIW_MAX_ORD_QP; sdev->attrs.max_ird = SIW_MAX_IRD_QP; sdev->attrs.max_sge = SIW_MAX_SGE; sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD; sdev->attrs.max_cq = SIW_MAX_CQ; sdev->attrs.max_cqe = SIW_MAX_CQE; sdev->attrs.max_mr = SIW_MAX_MR; sdev->attrs.max_pd = SIW_MAX_PD; sdev->attrs.max_mw = SIW_MAX_MW; sdev->attrs.max_srq = SIW_MAX_SRQ; sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR; sdev->attrs.max_srq_sge = SIW_MAX_SGE; INIT_LIST_HEAD(&sdev->cep_list); INIT_LIST_HEAD(&sdev->qp_list); atomic_set(&sdev->num_ctx, 0); atomic_set(&sdev->num_srq, 0); atomic_set(&sdev->num_qp, 0); atomic_set(&sdev->num_cq, 0); atomic_set(&sdev->num_mr, 0); atomic_set(&sdev->num_pd, 0); sdev->numa_node = dev_to_node(&netdev->dev); spin_lock_init(&sdev->lock); return sdev; error: ib_dealloc_device(base_dev); return NULL; } /* * Network link becomes unavailable. Mark all * affected QP's accordingly. */ static void siw_netdev_down(struct work_struct *work) { struct siw_device *sdev = container_of(work, struct siw_device, netdev_down); struct siw_qp_attrs qp_attrs; struct list_head *pos, *tmp; memset(&qp_attrs, 0, sizeof(qp_attrs)); qp_attrs.state = SIW_QP_STATE_ERROR; list_for_each_safe(pos, tmp, &sdev->qp_list) { struct siw_qp *qp = list_entry(pos, struct siw_qp, devq); down_write(&qp->state_lock); WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE)); up_write(&qp->state_lock); } ib_device_put(&sdev->base_dev); } static void siw_device_goes_down(struct siw_device *sdev) { if (ib_device_try_get(&sdev->base_dev)) { INIT_WORK(&sdev->netdev_down, siw_netdev_down); schedule_work(&sdev->netdev_down); } } static int siw_netdev_event(struct notifier_block *nb, unsigned long event, void *arg) { struct net_device *netdev = netdev_notifier_info_to_dev(arg); struct ib_device *base_dev; struct siw_device *sdev; dev_dbg(&netdev->dev, "siw: event %lu\n", event); base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (!base_dev) return NOTIFY_OK; sdev = to_siw_dev(base_dev); switch (event) { case NETDEV_UP: sdev->state = IB_PORT_ACTIVE; siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); break; case NETDEV_GOING_DOWN: siw_device_goes_down(sdev); break; case NETDEV_DOWN: sdev->state = IB_PORT_DOWN; siw_port_event(sdev, 1, IB_EVENT_PORT_ERR); break; case NETDEV_REGISTER: /* * Device registration now handled only by * rdma netlink commands. So it shall be impossible * to end up here with a valid siw device. 
*/ siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n"); break; case NETDEV_UNREGISTER: ib_unregister_device_queued(&sdev->base_dev); break; case NETDEV_CHANGEADDR: siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); break; /* * Todo: Below netdev events are currently not handled. */ case NETDEV_CHANGEMTU: case NETDEV_CHANGE: break; default: break; } ib_device_put(&sdev->base_dev); return NOTIFY_OK; } static struct notifier_block siw_netdev_nb = { .notifier_call = siw_netdev_event, }; static int siw_newlink(const char *basedev_name, struct net_device *netdev) { struct ib_device *base_dev; struct siw_device *sdev = NULL; int rv = -ENOMEM; if (!siw_dev_qualified(netdev)) return -EINVAL; base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (base_dev) { ib_device_put(base_dev); return -EEXIST; } sdev = siw_device_create(netdev); if (sdev) { dev_dbg(&netdev->dev, "siw: new device\n"); if (netif_running(netdev) && netif_carrier_ok(netdev)) sdev->state = IB_PORT_ACTIVE; else sdev->state = IB_PORT_DOWN; rv = siw_device_register(sdev, basedev_name); if (rv) ib_dealloc_device(&sdev->base_dev); } return rv; } static struct rdma_link_ops siw_link_ops = { .type = "siw", .newlink = siw_newlink, }; /* * siw_init_module - Initialize Softiwarp module and register with netdev * subsystem. */ static __init int siw_init_module(void) { int rv; if (SENDPAGE_THRESH < SIW_MAX_INLINE) { pr_info("siw: sendpage threshold too small: %u\n", (int)SENDPAGE_THRESH); rv = -EINVAL; goto out_error; } rv = siw_init_cpulist(); if (rv) goto out_error; rv = siw_cm_init(); if (rv) goto out_error; if (!siw_create_tx_threads()) { pr_info("siw: Could not start any TX thread\n"); rv = -ENOMEM; goto out_error; } /* * Locate CRC32 algorithm. If unsuccessful, fail * loading siw only, if CRC is required. */ siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(siw_crypto_shash)) { pr_info("siw: Loading CRC32c failed: %ld\n", PTR_ERR(siw_crypto_shash)); siw_crypto_shash = NULL; if (mpa_crc_required) { rv = -EOPNOTSUPP; goto out_error; } } rv = register_netdevice_notifier(&siw_netdev_nb); if (rv) goto out_error; rdma_link_register(&siw_link_ops); pr_info("SoftiWARP attached\n"); return 0; out_error: siw_stop_tx_threads(); if (siw_crypto_shash) crypto_free_shash(siw_crypto_shash); pr_info("SoftIWARP attach failed. Error: %d\n", rv); siw_cm_exit(); siw_destroy_cpulist(siw_cpu_info.num_nodes); return rv; } static void __exit siw_exit_module(void) { siw_stop_tx_threads(); unregister_netdevice_notifier(&siw_netdev_nb); rdma_link_unregister(&siw_link_ops); ib_unregister_driver(RDMA_DRIVER_SIW); siw_cm_exit(); siw_destroy_cpulist(siw_cpu_info.num_nodes); if (siw_crypto_shash) crypto_free_shash(siw_crypto_shash); pr_info("SoftiWARP detached\n"); } module_init(siw_init_module); module_exit(siw_exit_module); MODULE_ALIAS_RDMA_LINK("siw");
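/*
 * Illustrative userspace sketch (not part of the original sources): the
 * least-loaded selection performed by siw_get_tx_cpu() above -- scan the
 * candidate CPUs, skip those that have no TX thread, and pick the one with
 * the smallest active-QP count. has_tx_thread[] and use_cnt[] are made-up
 * stand-ins for siw_tx_thread[] and the per-CPU siw_use_cnt counters.
 */
#include <stdio.h>

#define NCPUS	4
#define MAX_QP	1024	/* stand-in for SIW_MAX_QP as the initial minimum */

int main(void)
{
	int has_tx_thread[NCPUS] = { 1, 0, 1, 1 };
	int use_cnt[NCPUS] = { 3, 0, 7, 2 };
	int cpu, min_use = MAX_QP, tx_cpu = -1;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		if (!has_tx_thread[cpu])
			continue;		/* no TX thread on this core */
		if (use_cnt[cpu] <= min_use) {
			tx_cpu = cpu;		/* new (or equal) minimum */
			min_use = use_cnt[cpu];
		}
	}
	printf("selected tx cpu %d (%d QPs)\n", tx_cpu, min_use);
	return 0;
}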
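/*
 * Illustrative userspace sketch (not part of the original sources): the
 * bucketing done by siw_init_cpulist() above -- group every CPU into a
 * per-NUMA-node set so the TX-CPU search can be restricted to the node of
 * the transmitting interface. cpu_node[] is a made-up stand-in for the
 * kernel's cpu_to_node() topology information.
 */
#include <stdio.h>

#define NCPUS	8
#define NNODES	2

int main(void)
{
	int cpu_node[NCPUS] = { 0, 0, 0, 0, 1, 1, 1, 1 };
	unsigned long node_mask[NNODES] = { 0, 0 };	/* one bit per CPU */
	int cpu, node;

	for (cpu = 0; cpu < NCPUS; cpu++)
		node_mask[cpu_node[cpu]] |= 1UL << cpu;	/* like cpumask_set_cpu() */

	for (node = 0; node < NNODES; node++)
		printf("node %d cpumask: 0x%02lx\n", node, node_mask[node]);
	return 0;
}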
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
 */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/netfilter/xt_devgroup.h>
#include <linux/netfilter/x_tables.h>

MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Xtables: Device group match");
MODULE_ALIAS("ipt_devgroup");
MODULE_ALIAS("ip6t_devgroup");

static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
	const struct xt_devgroup_info *info = par->matchinfo;

	if (info->flags & XT_DEVGROUP_MATCH_SRC &&
	    (((info->src_group ^ xt_in(par)->group) & info->src_mask ? 1 : 0) ^
	     ((info->flags & XT_DEVGROUP_INVERT_SRC) ? 1 : 0)))
		return false;

	if (info->flags & XT_DEVGROUP_MATCH_DST &&
	    (((info->dst_group ^ xt_out(par)->group) & info->dst_mask ? 1 : 0) ^
	     ((info->flags & XT_DEVGROUP_INVERT_DST) ? 1 : 0)))
		return false;

	return true;
}

static int devgroup_mt_checkentry(const struct xt_mtchk_param *par)
{
	const struct xt_devgroup_info *info = par->matchinfo;

	if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC |
			    XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST))
		return -EINVAL;

	if (info->flags & XT_DEVGROUP_MATCH_SRC &&
	    par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) |
			       (1 << NF_INET_LOCAL_IN) |
			       (1 << NF_INET_FORWARD)))
		return -EINVAL;

	if (info->flags & XT_DEVGROUP_MATCH_DST &&
	    par->hook_mask & ~((1 << NF_INET_FORWARD) |
			       (1 << NF_INET_LOCAL_OUT) |
			       (1 << NF_INET_POST_ROUTING)))
		return -EINVAL;

	return 0;
}

static struct xt_match devgroup_mt_reg __read_mostly = {
	.name		= "devgroup",
	.match		= devgroup_mt,
	.checkentry	= devgroup_mt_checkentry,
	.matchsize	= sizeof(struct xt_devgroup_info),
	.family		= NFPROTO_UNSPEC,
	.me		= THIS_MODULE
};

static int __init devgroup_mt_init(void)
{
	return xt_register_match(&devgroup_mt_reg);
}

static void __exit devgroup_mt_exit(void)
{
	xt_unregister_match(&devgroup_mt_reg);
}

module_init(devgroup_mt_init);
module_exit(devgroup_mt_exit);
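/*
 * Illustrative userspace sketch (not part of the original sources): the
 * masked compare-with-invert test that devgroup_mt() above applies to the
 * ingress and egress device groups. A device matches when the configured
 * group and the device's group agree on every bit selected by the mask;
 * the invert flag flips the result. The values in main() are arbitrary.
 */
#include <stdio.h>
#include <stdbool.h>

static bool group_matches(unsigned int cfg_group, unsigned int dev_group,
			  unsigned int mask, bool invert)
{
	bool differs = ((cfg_group ^ dev_group) & mask) != 0;

	return differs == invert;	/* a mismatch only passes when inverted */
}

int main(void)
{
	/* match devices in group 0x10 (full mask), not inverted */
	printf("%d\n", group_matches(0x10, 0x10, 0xffffffff, false));	/* 1 */
	printf("%d\n", group_matches(0x10, 0x11, 0xffffffff, false));	/* 0 */
	/* inverted: everything except group 0x10 */
	printf("%d\n", group_matches(0x10, 0x11, 0xffffffff, true));	/* 1 */
	return 0;
}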
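/*
 * Illustrative userspace sketch (not part of the original sources): how the
 * IPv4 reassembly code that follows (ip_frag_queue()) decodes the 16-bit
 * frag_off field -- the top bits carry the DF/MF flags and the low 13 bits
 * carry the fragment offset in 8-byte units. The sample value is arbitrary;
 * the constants are defined locally so the sketch is self-contained.
 */
#include <stdio.h>
#include <arpa/inet.h>

#define IP_MF		0x2000	/* "more fragments" flag */
#define IP_DF		0x4000	/* "don't fragment" flag */
#define IP_OFFSET	0x1FFF	/* mask for the fragment offset bits */

int main(void)
{
	unsigned short frag_off = htons(IP_MF | 185);	/* sample on-wire value */
	unsigned int offset = ntohs(frag_off);
	unsigned int flags = offset & ~IP_OFFSET;

	offset &= IP_OFFSET;
	offset <<= 3;			/* offset is in 8-byte chunks */

	printf("more fragments: %s, byte offset: %u\n",
	       (flags & IP_MF) ? "yes" : "no", offset);
	return 0;
}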
// SPDX-License-Identifier: GPL-2.0
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The IP fragmentation functionality.
 *
 * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
 *		David S. Miller :	Begin massive cleanup...
* Andi Kleen : Add sysctls. * xxxx : Overlapfrag bug. * Ultima : ip_expire() kernel panic. * Bill Hawes : Frag accounting and evictor fixes. * John McDonald : 0 length frag bug. * Alexey Kuznetsov: SMP races, threading, cleanup. * Patrick McHardy : LRU queue of frag heads for evictor. */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/compiler.h> #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/jiffies.h> #include <linux/skbuff.h> #include <linux/list.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/netdevice.h> #include <linux/jhash.h> #include <linux/random.h> #include <linux/slab.h> #include <net/route.h> #include <net/dst.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/checksum.h> #include <net/inetpeer.h> #include <net/inet_frag.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/inet.h> #include <linux/netfilter_ipv4.h> #include <net/inet_ecn.h> #include <net/l3mdev.h> /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c * as well. Or notify me, at least. --ANK */ static const char ip_frag_cache_name[] = "ip4-frags"; /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; u8 ecn; /* RFC3168 support */ u16 max_df_size; /* largest frag with DF set seen */ int iif; unsigned int rid; struct inet_peer *peer; }; static u8 ip4_frag_ecn(u8 tos) { return 1 << (tos & INET_ECN_MASK); } static struct inet_frags ip4_frags; static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev); static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q); struct net *net = q->fqdir->net; const struct frag_v4_compare_key *key = a; q->key.v4 = *key; qp->ecn = 0; qp->peer = q->fqdir->max_dist ? inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : NULL; } static void ip4_frag_free(struct inet_frag_queue *q) { struct ipq *qp; qp = container_of(q, struct ipq, q); if (qp->peer) inet_putpeer(qp->peer); } /* Destruction primitives. */ static void ipq_put(struct ipq *ipq) { inet_frag_put(&ipq->q); } /* Kill ipq entry. It is not destroyed immediately, * because caller (and someone more) holds reference count. */ static void ipq_kill(struct ipq *ipq) { inet_frag_kill(&ipq->q); } static bool frag_expire_skip_icmp(u32 user) { return user == IP_DEFRAG_AF_PACKET || ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN, __IP_DEFRAG_CONNTRACK_IN_END) || ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN, __IP_DEFRAG_CONNTRACK_BRIDGE_IN); } /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ static void ip_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); const struct iphdr *iph; struct sk_buff *head = NULL; struct net *net; struct ipq *qp; int err; qp = container_of(frag, struct ipq, q); net = qp->q.fqdir->net; rcu_read_lock(); /* Paired with WRITE_ONCE() in fqdir_pre_exit(). */ if (READ_ONCE(qp->q.fqdir->dead)) goto out_rcu_unlock; spin_lock(&qp->q.lock); if (qp->q.flags & INET_FRAG_COMPLETE) goto out; qp->q.flags |= INET_FRAG_DROP; ipq_kill(qp); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); if (!(qp->q.flags & INET_FRAG_FIRST_IN)) goto out; /* sk_buff::dev and sk_buff::rbnode are unionized. 
So we * pull the head out of the tree in order to be able to * deal with head->dev. */ head = inet_frag_pull_head(&qp->q); if (!head) goto out; head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out; /* skb has no dst, perform route lookup again */ iph = ip_hdr(head); err = ip_route_input_noref(head, iph->daddr, iph->saddr, iph->tos, head->dev); if (err) goto out; /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ if (frag_expire_skip_icmp(qp->q.key.v4.user) && (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; spin_unlock(&qp->q.lock); icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); goto out_rcu_unlock; out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT); ipq_put(qp); } /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. */ static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user, int vif) { struct frag_v4_compare_key key = { .saddr = iph->saddr, .daddr = iph->daddr, .user = user, .vif = vif, .id = iph->id, .protocol = iph->protocol, }; struct inet_frag_queue *q; q = inet_frag_find(net->ipv4.fqdir, &key); if (!q) return NULL; return container_of(q, struct ipq, q); } /* Is the fragment too far ahead to be part of ipq? */ static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; unsigned int max = qp->q.fqdir->max_dist; unsigned int start, end; int rc; if (!peer || !max) return 0; start = qp->rid; end = atomic_inc_return(&peer->rid); qp->rid = end; rc = qp->q.fragments_tail && (end - start) > max; if (rc) __IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS); return rc; } static int ip_frag_reinit(struct ipq *qp) { unsigned int sum_truesize = 0; if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) { refcount_inc(&qp->q.refcnt); return -ETIMEDOUT; } sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments, SKB_DROP_REASON_FRAG_TOO_FAR); sub_frag_mem_limit(qp->q.fqdir, sum_truesize); qp->q.flags = 0; qp->q.len = 0; qp->q.meat = 0; qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; qp->q.last_run_head = NULL; qp->iif = 0; qp->ecn = 0; return 0; } /* Add new segment to existing queue. */ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct net *net = qp->q.fqdir->net; int ihl, end, flags, offset; struct sk_buff *prev_tail; struct net_device *dev; unsigned int fragsize; int err = -ENOENT; SKB_DR(reason); u8 ecn; /* If reassembly is already done, @skb must be a duplicate frag. */ if (qp->q.flags & INET_FRAG_COMPLETE) { SKB_DR_SET(reason, DUP_FRAG); goto err; } if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && unlikely(ip_frag_too_far(qp)) && unlikely(err = ip_frag_reinit(qp))) { ipq_kill(qp); goto err; } ecn = ip4_frag_ecn(ip_hdr(skb)->tos); offset = ntohs(ip_hdr(skb)->frag_off); flags = offset & ~IP_OFFSET; offset &= IP_OFFSET; offset <<= 3; /* offset is in 8-byte chunks */ ihl = ip_hdrlen(skb); /* Determine the position of this fragment. */ end = offset + skb->len - skb_network_offset(skb) - ihl; err = -EINVAL; /* Is this the final fragment? */ if ((flags & IP_MF) == 0) { /* If we already have some bits beyond end * or have different end, the segment is corrupted. 
*/ if (end < qp->q.len || ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len)) goto discard_qp; qp->q.flags |= INET_FRAG_LAST_IN; qp->q.len = end; } else { if (end&7) { end &= ~7; if (skb->ip_summed != CHECKSUM_UNNECESSARY) skb->ip_summed = CHECKSUM_NONE; } if (end > qp->q.len) { /* Some bits beyond end -> corruption. */ if (qp->q.flags & INET_FRAG_LAST_IN) goto discard_qp; qp->q.len = end; } } if (end == offset) goto discard_qp; err = -ENOMEM; if (!pskb_pull(skb, skb_network_offset(skb) + ihl)) goto discard_qp; err = pskb_trim_rcsum(skb, end - offset); if (err) goto discard_qp; /* Note : skb->rbnode and skb->dev share the same location. */ dev = skb->dev; /* Makes sure compiler wont do silly aliasing games */ barrier(); prev_tail = qp->q.fragments_tail; err = inet_frag_queue_insert(&qp->q, skb, offset, end); if (err) goto insert_error; if (dev) qp->iif = dev->ifindex; qp->q.stamp = skb->tstamp; qp->q.mono_delivery_time = skb->mono_delivery_time; qp->q.meat += skb->len; qp->ecn |= ecn; add_frag_mem_limit(qp->q.fqdir, skb->truesize); if (offset == 0) qp->q.flags |= INET_FRAG_FIRST_IN; fragsize = skb->len + ihl; if (fragsize > qp->q.max_size) qp->q.max_size = fragsize; if (ip_hdr(skb)->frag_off & htons(IP_DF) && fragsize > qp->max_df_size) qp->max_df_size = fragsize; if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && qp->q.meat == qp->q.len) { unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; err = ip_frag_reasm(qp, skb, prev_tail, dev); skb->_skb_refdst = orefdst; if (err) inet_frag_kill(&qp->q); return err; } skb_dst_drop(skb); return -EINPROGRESS; insert_error: if (err == IPFRAG_DUP) { SKB_DR_SET(reason, DUP_FRAG); err = -EINVAL; goto err; } err = -EINVAL; __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); discard_qp: inet_frag_kill(&qp->q); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); err: kfree_skb_reason(skb, reason); return err; } static bool ip_frag_coalesce_ok(const struct ipq *qp) { return qp->q.key.v4.user == IP_DEFRAG_LOCAL_DELIVER; } /* Build a new IP datagram from all its fragments. */ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) { struct net *net = qp->q.fqdir->net; struct iphdr *iph; void *reasm_data; int len, err; u8 ecn; ipq_kill(qp); ecn = ip_frag_ecn_table[qp->ecn]; if (unlikely(ecn == 0xff)) { err = -EINVAL; goto out_fail; } /* Make the one we just received the head. */ reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail); if (!reasm_data) goto out_nomem; len = ip_hdrlen(skb) + qp->q.len; err = -E2BIG; if (len > 65535) goto out_oversize; inet_frag_reasm_finish(&qp->q, skb, reasm_data, ip_frag_coalesce_ok(qp)); skb->dev = dev; IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size); iph = ip_hdr(skb); iph->tot_len = htons(len); iph->tos |= ecn; /* When we set IP_DF on a refragmented skb we must also force a * call to ip_fragment to avoid forwarding a DF-skb of size s while * original sender only sent fragments of size f (where f < s). * * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest * frag seen to avoid sending tiny DF-fragments in case skb was built * from one very small df-fragment and one large non-df frag. 
*/ if (qp->max_df_size == qp->q.max_size) { IPCB(skb)->flags |= IPSKB_FRAG_PMTU; iph->frag_off = htons(IP_DF); } else { iph->frag_off = 0; } ip_send_check(iph); __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; qp->q.last_run_head = NULL; return 0; out_nomem: net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp); err = -ENOMEM; goto out_fail; out_oversize: net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr); out_fail: __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); return err; } /* Process an incoming IP datagram fragment. */ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct net_device *dev = skb->dev ? : skb_dst(skb)->dev; int vif = l3mdev_master_ifindex_rcu(dev); struct ipq *qp; __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); skb_orphan(skb); /* Lookup (or create) queue header */ qp = ip_find(net, ip_hdr(skb), user, vif); if (qp) { int ret; spin_lock(&qp->q.lock); ret = ip_frag_queue(qp, skb); spin_unlock(&qp->q.lock); ipq_put(qp); return ret; } __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -ENOMEM; } EXPORT_SYMBOL(ip_defrag); struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct iphdr iph; int netoff; u32 len; if (skb->protocol != htons(ETH_P_IP)) return skb; netoff = skb_network_offset(skb); if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0) return skb; if (iph.ihl < 5 || iph.version != 4) return skb; len = ntohs(iph.tot_len); if (skb->len < netoff + len || len < (iph.ihl * 4)) return skb; if (ip_is_fragment(&iph)) { skb = skb_share_check(skb, GFP_ATOMIC); if (skb) { if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) { kfree_skb(skb); return NULL; } if (pskb_trim_rcsum(skb, netoff + len)) { kfree_skb(skb); return NULL; } memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); if (ip_defrag(net, skb, user)) return NULL; skb_clear_hash(skb); } } return skb; } EXPORT_SYMBOL(ip_check_defrag); #ifdef CONFIG_SYSCTL static int dist_min; static struct ctl_table ip4_frags_ns_ctl_table[] = { { .procname = "ipfrag_high_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "ipfrag_low_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "ipfrag_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ipfrag_max_dist", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &dist_min, }, { } }; /* secret interval has been deprecated */ static int ip4_frags_secret_interval_unused; static struct ctl_table ip4_frags_ctl_table[] = { { .procname = "ipfrag_secret_interval", .data = &ip4_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { } }; static int __net_init ip4_frags_ns_ctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; table = ip4_frags_ns_ctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL); if (!table) goto err_alloc; } table[0].data = &net->ipv4.fqdir->high_thresh; table[0].extra1 = &net->ipv4.fqdir->low_thresh; table[1].data = &net->ipv4.fqdir->low_thresh; table[1].extra2 = &net->ipv4.fqdir->high_thresh; table[2].data = &net->ipv4.fqdir->timeout; table[3].data = &net->ipv4.fqdir->max_dist; hdr = register_net_sysctl_sz(net, "net/ipv4", table, ARRAY_SIZE(ip4_frags_ns_ctl_table)); if (!hdr) goto 
err_reg; net->ipv4.frags_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) { struct ctl_table *table; table = net->ipv4.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.frags_hdr); kfree(table); } static void __init ip4_frags_ctl_register(void) { register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); } #else static int ip4_frags_ns_ctl_register(struct net *net) { return 0; } static void ip4_frags_ns_ctl_unregister(struct net *net) { } static void __init ip4_frags_ctl_register(void) { } #endif static int __net_init ipv4_frags_init_net(struct net *net) { int res; res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net); if (res < 0) return res; /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for * the real memory usage, by measuring both the size of frag * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue)) * and the SKB's truesize. * * A 64K fragment consumes 129736 bytes (44*2944)+200 * (1500 truesize == 2944, sizeof(struct ipq) == 200) * * We will commit 4MB at one time. Should we cross that limit * we will prune down to 3MB, making room for approx 8 big 64K * fragments 8x128k. */ net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024; net->ipv4.fqdir->low_thresh = 3 * 1024 * 1024; /* * Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival * by TTL. */ net->ipv4.fqdir->timeout = IP_FRAG_TIME; net->ipv4.fqdir->max_dist = 64; res = ip4_frags_ns_ctl_register(net); if (res < 0) fqdir_exit(net->ipv4.fqdir); return res; } static void __net_exit ipv4_frags_pre_exit_net(struct net *net) { fqdir_pre_exit(net->ipv4.fqdir); } static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); fqdir_exit(net->ipv4.fqdir); } static struct pernet_operations ip4_frags_ops = { .init = ipv4_frags_init_net, .pre_exit = ipv4_frags_pre_exit_net, .exit = ipv4_frags_exit_net, }; static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed) { return jhash2(data, sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); } static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed) { const struct inet_frag_queue *fq = data; return jhash2((const u32 *)&fq->key.v4, sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); } static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) { const struct frag_v4_compare_key *key = arg->key; const struct inet_frag_queue *fq = ptr; return !!memcmp(&fq->key, key, sizeof(*key)); } static const struct rhashtable_params ip4_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), .key_offset = offsetof(struct inet_frag_queue, key), .key_len = sizeof(struct frag_v4_compare_key), .hashfn = ip4_key_hashfn, .obj_hashfn = ip4_obj_hashfn, .obj_cmpfn = ip4_obj_cmpfn, .automatic_shrinking = true, }; void __init ipfrag_init(void) { ip4_frags.constructor = ip4_frag_init; ip4_frags.destructor = ip4_frag_free; ip4_frags.qsize = sizeof(struct ipq); ip4_frags.frag_expire = ip_expire; ip4_frags.frags_cache_name = ip_frag_cache_name; ip4_frags.rhash_params = ip4_rhash_params; if (inet_frags_init(&ip4_frags)) panic("IP: failed to allocate ip4_frags cache\n"); ip4_frags_ctl_register(); register_pernet_subsys(&ip4_frags_ops); }
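/*
 * Illustrative aside (not part of the kernel file above): a back-of-the-
 * envelope check of the memory-accounting comment in ipv4_frags_init_net().
 * The figures are the approximations quoted in that comment (truesize of a
 * 1500-byte fragment skb, sizeof(struct ipq), ~44 fragments for a 64KB
 * datagram), not values measured on a running kernel.
 */
#include <stdio.h>

int main(void)
{
        const unsigned int skb_truesize = 2944;        /* ~truesize of a 1500-byte frag */
        const unsigned int ipq_size     = 200;         /* ~sizeof(struct ipq) */
        const unsigned int nr_frags     = 44;          /* ~64KB / ~1480B payload per frag */
        const unsigned int high_thresh  = 4 * 1024 * 1024;
        const unsigned int low_thresh   = 3 * 1024 * 1024;

        unsigned int per_datagram = nr_frags * skb_truesize + ipq_size;

        /* 129736 bytes, matching the "(44*2944)+200" note in the source */
        printf("one 64KB datagram accounts for ~%u bytes\n", per_datagram);

        /* pruning from 4MB to 3MB frees room for roughly 8 such datagrams */
        printf("headroom after pruning: %u bytes (~%u datagrams)\n",
               high_thresh - low_thresh,
               (high_thresh - low_thresh) / per_datagram);
        return 0;
}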
/*
 * Copyright (c) 2016 Intel Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that copyright
 * notice and this permission notice appear in supporting documentation, and
 * that the name of the copyright holders not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  The copyright holders make no representations
 * about the suitability of this software for any purpose.  It is provided "as
 * is" without express or implied warranty.
* * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. */ #include <linux/uaccess.h> #include <drm/drm_drv.h> #include <drm/drm_encoder.h> #include <drm/drm_file.h> #include <drm/drm_framebuffer.h> #include <drm/drm_managed.h> #include <drm/drm_mode_config.h> #include <drm/drm_print.h> #include <linux/dma-resv.h> #include "drm_crtc_internal.h" #include "drm_internal.h" int drm_modeset_register_all(struct drm_device *dev) { int ret; ret = drm_plane_register_all(dev); if (ret) goto err_plane; ret = drm_crtc_register_all(dev); if (ret) goto err_crtc; ret = drm_encoder_register_all(dev); if (ret) goto err_encoder; ret = drm_connector_register_all(dev); if (ret) goto err_connector; return 0; err_connector: drm_encoder_unregister_all(dev); err_encoder: drm_crtc_unregister_all(dev); err_crtc: drm_plane_unregister_all(dev); err_plane: return ret; } void drm_modeset_unregister_all(struct drm_device *dev) { drm_connector_unregister_all(dev); drm_encoder_unregister_all(dev); drm_crtc_unregister_all(dev); drm_plane_unregister_all(dev); } /** * drm_mode_getresources - get graphics configuration * @dev: drm device for the ioctl * @data: data pointer for the ioctl * @file_priv: drm file for the ioctl call * * Construct a set of configuration description structures and return * them to the user, including CRTC, connector and framebuffer configuration. * * Called by the user via ioctl. * * Returns: * Zero on success, negative errno on failure. 
*/ int drm_mode_getresources(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_card_res *card_res = data; struct drm_framebuffer *fb; struct drm_connector *connector; struct drm_crtc *crtc; struct drm_encoder *encoder; int count, ret = 0; uint32_t __user *fb_id; uint32_t __user *crtc_id; uint32_t __user *connector_id; uint32_t __user *encoder_id; struct drm_connector_list_iter conn_iter; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; mutex_lock(&file_priv->fbs_lock); count = 0; fb_id = u64_to_user_ptr(card_res->fb_id_ptr); list_for_each_entry(fb, &file_priv->fbs, filp_head) { if (count < card_res->count_fbs && put_user(fb->base.id, fb_id + count)) { mutex_unlock(&file_priv->fbs_lock); return -EFAULT; } count++; } card_res->count_fbs = count; mutex_unlock(&file_priv->fbs_lock); card_res->max_height = dev->mode_config.max_height; card_res->min_height = dev->mode_config.min_height; card_res->max_width = dev->mode_config.max_width; card_res->min_width = dev->mode_config.min_width; count = 0; crtc_id = u64_to_user_ptr(card_res->crtc_id_ptr); drm_for_each_crtc(crtc, dev) { if (drm_lease_held(file_priv, crtc->base.id)) { if (count < card_res->count_crtcs && put_user(crtc->base.id, crtc_id + count)) return -EFAULT; count++; } } card_res->count_crtcs = count; count = 0; encoder_id = u64_to_user_ptr(card_res->encoder_id_ptr); drm_for_each_encoder(encoder, dev) { if (count < card_res->count_encoders && put_user(encoder->base.id, encoder_id + count)) return -EFAULT; count++; } card_res->count_encoders = count; drm_connector_list_iter_begin(dev, &conn_iter); count = 0; connector_id = u64_to_user_ptr(card_res->connector_id_ptr); drm_for_each_connector_iter(connector, &conn_iter) { /* only expose writeback connectors if userspace understands them */ if (!file_priv->writeback_connectors && (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK)) continue; if (drm_lease_held(file_priv, connector->base.id)) { if (count < card_res->count_connectors && put_user(connector->base.id, connector_id + count)) { drm_connector_list_iter_end(&conn_iter); return -EFAULT; } count++; } } card_res->count_connectors = count; drm_connector_list_iter_end(&conn_iter); return ret; } /** * drm_mode_config_reset - call ->reset callbacks * @dev: drm device * * This functions calls all the crtc's, encoder's and connector's ->reset * callback. Drivers can use this in e.g. their driver load or resume code to * reset hardware and software state. 
*/ void drm_mode_config_reset(struct drm_device *dev) { struct drm_crtc *crtc; struct drm_plane *plane; struct drm_encoder *encoder; struct drm_connector *connector; struct drm_connector_list_iter conn_iter; drm_for_each_plane(plane, dev) if (plane->funcs->reset) plane->funcs->reset(plane); drm_for_each_crtc(crtc, dev) if (crtc->funcs->reset) crtc->funcs->reset(crtc); drm_for_each_encoder(encoder, dev) if (encoder->funcs && encoder->funcs->reset) encoder->funcs->reset(encoder); drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) if (connector->funcs->reset) connector->funcs->reset(connector); drm_connector_list_iter_end(&conn_iter); } EXPORT_SYMBOL(drm_mode_config_reset); /* * Global properties */ static const struct drm_prop_enum_list drm_plane_type_enum_list[] = { { DRM_PLANE_TYPE_OVERLAY, "Overlay" }, { DRM_PLANE_TYPE_PRIMARY, "Primary" }, { DRM_PLANE_TYPE_CURSOR, "Cursor" }, }; static int drm_mode_create_standard_properties(struct drm_device *dev) { struct drm_property *prop; int ret; ret = drm_connector_create_standard_properties(dev); if (ret) return ret; prop = drm_property_create_enum(dev, DRM_MODE_PROP_IMMUTABLE, "type", drm_plane_type_enum_list, ARRAY_SIZE(drm_plane_type_enum_list)); if (!prop) return -ENOMEM; dev->mode_config.plane_type_property = prop; prop = drm_property_create_range(dev, DRM_MODE_PROP_ATOMIC, "SRC_X", 0, UINT_MAX); if (!prop) return -ENOMEM; dev->mode_config.prop_src_x = prop; prop = drm_property_create_range(dev, DRM_MODE_PROP_ATOMIC, "SRC_Y", 0, UINT_MAX); if (!prop) return -ENOMEM; dev->mode_config.prop_src_y = prop; prop = drm_property_create_range(dev, DRM_MODE_PROP_ATOMIC, "SRC_W", 0, UINT_MAX); if (!prop) return -ENOMEM; dev->mode_config.prop_src_w = prop; prop = drm_property_create_range(dev, DRM_MODE_PROP_ATOMIC, "SRC_H", 0, UINT_MAX); if (!prop) return -ENOMEM; dev->mode_config.prop_src_h = prop; prop = drm_property_create_signed_range(dev, DRM_MODE_PROP_ATOMIC, "CRTC_X", INT_MIN, INT_MAX); if (!prop) return -ENOMEM; dev->mode_config.prop_crtc_x = prop; prop = drm_property_create_signed_range(dev, DRM_MODE_PROP_ATOMIC, "CRTC_Y", INT_MIN, INT_MAX); if (!prop) return -ENOMEM; dev->mode_config.prop_crtc_y = prop; prop = drm_property_create_range(dev, DRM_MODE_PROP_ATOMIC, "CRTC_W", 0, INT_MAX); if (!prop) return -ENOMEM; dev->mode_config.prop_crtc_w = prop; prop = drm_property_create_range(dev, DRM_MODE_PROP_ATOMIC, "CRTC_H", 0, INT_MAX); if (!prop) return -ENOMEM; dev->mode_config.prop_crtc_h = prop; prop = drm_property_create_object(dev, DRM_MODE_PROP_ATOMIC, "FB_ID", DRM_MODE_OBJECT_FB); if (!prop) return -ENOMEM; dev->mode_config.prop_fb_id = prop; prop = drm_property_create_signed_range(dev, DRM_MODE_PROP_ATOMIC, "IN_FENCE_FD", -1, INT_MAX); if (!prop) return -ENOMEM; dev->mode_config.prop_in_fence_fd = prop; prop = drm_property_create_range(dev, DRM_MODE_PROP_ATOMIC, "OUT_FENCE_PTR", 0, U64_MAX); if (!prop) return -ENOMEM; dev->mode_config.prop_out_fence_ptr = prop; prop = drm_property_create_object(dev, DRM_MODE_PROP_ATOMIC, "CRTC_ID", DRM_MODE_OBJECT_CRTC); if (!prop) return -ENOMEM; dev->mode_config.prop_crtc_id = prop; prop = drm_property_create(dev, DRM_MODE_PROP_ATOMIC | DRM_MODE_PROP_BLOB, "FB_DAMAGE_CLIPS", 0); if (!prop) return -ENOMEM; dev->mode_config.prop_fb_damage_clips = prop; prop = drm_property_create_bool(dev, DRM_MODE_PROP_ATOMIC, "ACTIVE"); if (!prop) return -ENOMEM; dev->mode_config.prop_active = prop; prop = drm_property_create(dev, DRM_MODE_PROP_ATOMIC | DRM_MODE_PROP_BLOB, 
"MODE_ID", 0); if (!prop) return -ENOMEM; dev->mode_config.prop_mode_id = prop; prop = drm_property_create_bool(dev, 0, "VRR_ENABLED"); if (!prop) return -ENOMEM; dev->mode_config.prop_vrr_enabled = prop; prop = drm_property_create(dev, DRM_MODE_PROP_BLOB, "DEGAMMA_LUT", 0); if (!prop) return -ENOMEM; dev->mode_config.degamma_lut_property = prop; prop = drm_property_create_range(dev, DRM_MODE_PROP_IMMUTABLE, "DEGAMMA_LUT_SIZE", 0, UINT_MAX); if (!prop) return -ENOMEM; dev->mode_config.degamma_lut_size_property = prop; prop = drm_property_create(dev, DRM_MODE_PROP_BLOB, "CTM", 0); if (!prop) return -ENOMEM; dev->mode_config.ctm_property = prop; prop = drm_property_create(dev, DRM_MODE_PROP_BLOB, "GAMMA_LUT", 0); if (!prop) return -ENOMEM; dev->mode_config.gamma_lut_property = prop; prop = drm_property_create_range(dev, DRM_MODE_PROP_IMMUTABLE, "GAMMA_LUT_SIZE", 0, UINT_MAX); if (!prop) return -ENOMEM; dev->mode_config.gamma_lut_size_property = prop; prop = drm_property_create(dev, DRM_MODE_PROP_IMMUTABLE | DRM_MODE_PROP_BLOB, "IN_FORMATS", 0); if (!prop) return -ENOMEM; dev->mode_config.modifiers_property = prop; return 0; } static void drm_mode_config_init_release(struct drm_device *dev, void *ptr) { drm_mode_config_cleanup(dev); } /** * drmm_mode_config_init - managed DRM mode_configuration structure * initialization * @dev: DRM device * * Initialize @dev's mode_config structure, used for tracking the graphics * configuration of @dev. * * Since this initializes the modeset locks, no locking is possible. Which is no * problem, since this should happen single threaded at init time. It is the * driver's problem to ensure this guarantee. * * Cleanup is automatically handled through registering drm_mode_config_cleanup * with drmm_add_action(). * * Returns: 0 on success, negative error value on failure. 
*/ int drmm_mode_config_init(struct drm_device *dev) { int ret; mutex_init(&dev->mode_config.mutex); drm_modeset_lock_init(&dev->mode_config.connection_mutex); mutex_init(&dev->mode_config.idr_mutex); mutex_init(&dev->mode_config.fb_lock); mutex_init(&dev->mode_config.blob_lock); INIT_LIST_HEAD(&dev->mode_config.fb_list); INIT_LIST_HEAD(&dev->mode_config.crtc_list); INIT_LIST_HEAD(&dev->mode_config.connector_list); INIT_LIST_HEAD(&dev->mode_config.encoder_list); INIT_LIST_HEAD(&dev->mode_config.property_list); INIT_LIST_HEAD(&dev->mode_config.property_blob_list); INIT_LIST_HEAD(&dev->mode_config.plane_list); INIT_LIST_HEAD(&dev->mode_config.privobj_list); idr_init_base(&dev->mode_config.object_idr, 1); idr_init_base(&dev->mode_config.tile_idr, 1); ida_init(&dev->mode_config.connector_ida); spin_lock_init(&dev->mode_config.connector_list_lock); init_llist_head(&dev->mode_config.connector_free_list); INIT_WORK(&dev->mode_config.connector_free_work, drm_connector_free_work_fn); ret = drm_mode_create_standard_properties(dev); if (ret) { drm_mode_config_cleanup(dev); return ret; } /* Just to be sure */ dev->mode_config.num_fb = 0; dev->mode_config.num_connector = 0; dev->mode_config.num_crtc = 0; dev->mode_config.num_encoder = 0; dev->mode_config.num_total_plane = 0; if (IS_ENABLED(CONFIG_LOCKDEP)) { struct drm_modeset_acquire_ctx modeset_ctx; struct ww_acquire_ctx resv_ctx; struct dma_resv resv; int ret; dma_resv_init(&resv); drm_modeset_acquire_init(&modeset_ctx, 0); ret = drm_modeset_lock(&dev->mode_config.connection_mutex, &modeset_ctx); if (ret == -EDEADLK) ret = drm_modeset_backoff(&modeset_ctx); ww_acquire_init(&resv_ctx, &reservation_ww_class); ret = dma_resv_lock(&resv, &resv_ctx); if (ret == -EDEADLK) dma_resv_lock_slow(&resv, &resv_ctx); dma_resv_unlock(&resv); ww_acquire_fini(&resv_ctx); drm_modeset_drop_locks(&modeset_ctx); drm_modeset_acquire_fini(&modeset_ctx); dma_resv_fini(&resv); } return drmm_add_action_or_reset(dev, drm_mode_config_init_release, NULL); } EXPORT_SYMBOL(drmm_mode_config_init); /** * drm_mode_config_cleanup - free up DRM mode_config info * @dev: DRM device * * Free up all the connectors and CRTCs associated with this DRM device, then * free up the framebuffers and associated buffer objects. * * Note that since this /should/ happen single-threaded at driver/device * teardown time, no locking is required. It's the driver's job to ensure that * this guarantee actually holds true. * * FIXME: With the managed drmm_mode_config_init() it is no longer necessary for * drivers to explicitly call this function. */ void drm_mode_config_cleanup(struct drm_device *dev) { struct drm_connector *connector; struct drm_connector_list_iter conn_iter; struct drm_crtc *crtc, *ct; struct drm_encoder *encoder, *enct; struct drm_framebuffer *fb, *fbt; struct drm_property *property, *pt; struct drm_property_blob *blob, *bt; struct drm_plane *plane, *plt; list_for_each_entry_safe(encoder, enct, &dev->mode_config.encoder_list, head) { encoder->funcs->destroy(encoder); } drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { /* drm_connector_list_iter holds an full reference to the * current connector itself, which means it is inherently safe * against unreferencing the current connector - but not against * deleting it right away. */ drm_connector_put(connector); } drm_connector_list_iter_end(&conn_iter); /* connector_iter drops references in a work item. 
*/ flush_work(&dev->mode_config.connector_free_work); if (WARN_ON(!list_empty(&dev->mode_config.connector_list))) { drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) DRM_ERROR("connector %s leaked!\n", connector->name); drm_connector_list_iter_end(&conn_iter); } list_for_each_entry_safe(property, pt, &dev->mode_config.property_list, head) { drm_property_destroy(dev, property); } list_for_each_entry_safe(plane, plt, &dev->mode_config.plane_list, head) { plane->funcs->destroy(plane); } list_for_each_entry_safe(crtc, ct, &dev->mode_config.crtc_list, head) { crtc->funcs->destroy(crtc); } list_for_each_entry_safe(blob, bt, &dev->mode_config.property_blob_list, head_global) { drm_property_blob_put(blob); } /* * Single-threaded teardown context, so it's not required to grab the * fb_lock to protect against concurrent fb_list access. Contrary, it * would actually deadlock with the drm_framebuffer_cleanup function. * * Also, if there are any framebuffers left, that's a driver leak now, * so politely WARN about this. */ WARN_ON(!list_empty(&dev->mode_config.fb_list)); list_for_each_entry_safe(fb, fbt, &dev->mode_config.fb_list, head) { struct drm_printer p = drm_debug_printer("[leaked fb]"); drm_printf(&p, "framebuffer[%u]:\n", fb->base.id); drm_framebuffer_print_info(&p, 1, fb); drm_framebuffer_free(&fb->base.refcount); } ida_destroy(&dev->mode_config.connector_ida); idr_destroy(&dev->mode_config.tile_idr); idr_destroy(&dev->mode_config.object_idr); drm_modeset_lock_fini(&dev->mode_config.connection_mutex); } EXPORT_SYMBOL(drm_mode_config_cleanup); static u32 full_encoder_mask(struct drm_device *dev) { struct drm_encoder *encoder; u32 encoder_mask = 0; drm_for_each_encoder(encoder, dev) encoder_mask |= drm_encoder_mask(encoder); return encoder_mask; } /* * For some reason we want the encoder itself included in * possible_clones. Make life easy for drivers by allowing them * to leave possible_clones unset if no cloning is possible. */ static void fixup_encoder_possible_clones(struct drm_encoder *encoder) { if (encoder->possible_clones == 0) encoder->possible_clones = drm_encoder_mask(encoder); } static void validate_encoder_possible_clones(struct drm_encoder *encoder) { struct drm_device *dev = encoder->dev; u32 encoder_mask = full_encoder_mask(dev); struct drm_encoder *other; drm_for_each_encoder(other, dev) { WARN(!!(encoder->possible_clones & drm_encoder_mask(other)) != !!(other->possible_clones & drm_encoder_mask(encoder)), "possible_clones mismatch: " "[ENCODER:%d:%s] mask=0x%x possible_clones=0x%x vs. 
" "[ENCODER:%d:%s] mask=0x%x possible_clones=0x%x\n", encoder->base.id, encoder->name, drm_encoder_mask(encoder), encoder->possible_clones, other->base.id, other->name, drm_encoder_mask(other), other->possible_clones); } WARN((encoder->possible_clones & drm_encoder_mask(encoder)) == 0 || (encoder->possible_clones & ~encoder_mask) != 0, "Bogus possible_clones: " "[ENCODER:%d:%s] possible_clones=0x%x (full encoder mask=0x%x)\n", encoder->base.id, encoder->name, encoder->possible_clones, encoder_mask); } static u32 full_crtc_mask(struct drm_device *dev) { struct drm_crtc *crtc; u32 crtc_mask = 0; drm_for_each_crtc(crtc, dev) crtc_mask |= drm_crtc_mask(crtc); return crtc_mask; } static void validate_encoder_possible_crtcs(struct drm_encoder *encoder) { u32 crtc_mask = full_crtc_mask(encoder->dev); WARN((encoder->possible_crtcs & crtc_mask) == 0 || (encoder->possible_crtcs & ~crtc_mask) != 0, "Bogus possible_crtcs: " "[ENCODER:%d:%s] possible_crtcs=0x%x (full crtc mask=0x%x)\n", encoder->base.id, encoder->name, encoder->possible_crtcs, crtc_mask); } void drm_mode_config_validate(struct drm_device *dev) { struct drm_encoder *encoder; struct drm_crtc *crtc; struct drm_plane *plane; u32 primary_with_crtc = 0, cursor_with_crtc = 0; unsigned int num_primary = 0; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return; drm_for_each_encoder(encoder, dev) fixup_encoder_possible_clones(encoder); drm_for_each_encoder(encoder, dev) { validate_encoder_possible_clones(encoder); validate_encoder_possible_crtcs(encoder); } drm_for_each_crtc(crtc, dev) { WARN(!crtc->primary, "Missing primary plane on [CRTC:%d:%s]\n", crtc->base.id, crtc->name); WARN(crtc->cursor && crtc->funcs->cursor_set, "[CRTC:%d:%s] must not have both a cursor plane and a cursor_set func", crtc->base.id, crtc->name); WARN(crtc->cursor && crtc->funcs->cursor_set2, "[CRTC:%d:%s] must not have both a cursor plane and a cursor_set2 func", crtc->base.id, crtc->name); WARN(crtc->cursor && crtc->funcs->cursor_move, "[CRTC:%d:%s] must not have both a cursor plane and a cursor_move func", crtc->base.id, crtc->name); if (crtc->primary) { WARN(!(crtc->primary->possible_crtcs & drm_crtc_mask(crtc)), "Bogus primary plane possible_crtcs: [PLANE:%d:%s] must be compatible with [CRTC:%d:%s]\n", crtc->primary->base.id, crtc->primary->name, crtc->base.id, crtc->name); WARN(primary_with_crtc & drm_plane_mask(crtc->primary), "Primary plane [PLANE:%d:%s] used for multiple CRTCs", crtc->primary->base.id, crtc->primary->name); primary_with_crtc |= drm_plane_mask(crtc->primary); } if (crtc->cursor) { WARN(!(crtc->cursor->possible_crtcs & drm_crtc_mask(crtc)), "Bogus cursor plane possible_crtcs: [PLANE:%d:%s] must be compatible with [CRTC:%d:%s]\n", crtc->cursor->base.id, crtc->cursor->name, crtc->base.id, crtc->name); WARN(cursor_with_crtc & drm_plane_mask(crtc->cursor), "Cursor plane [PLANE:%d:%s] used for multiple CRTCs", crtc->cursor->base.id, crtc->cursor->name); cursor_with_crtc |= drm_plane_mask(crtc->cursor); } } drm_for_each_plane(plane, dev) { if (plane->type == DRM_PLANE_TYPE_PRIMARY) num_primary++; } WARN(num_primary != dev->mode_config.num_crtc, "Must have as many primary planes as there are CRTCs, but have %u primary planes and %u CRTCs", num_primary, dev->mode_config.num_crtc); }
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * PTP virtual clock driver
 *
 * Copyright 2021 NXP
 */
#include <linux/slab.h>
#include <linux/hashtable.h>
#include "ptp_private.h"

#define PTP_VCLOCK_CC_SHIFT             31
#define PTP_VCLOCK_CC_MULT              (1 << PTP_VCLOCK_CC_SHIFT)
#define PTP_VCLOCK_FADJ_SHIFT           9
#define PTP_VCLOCK_FADJ_DENOMINATOR     15625ULL
#define PTP_VCLOCK_REFRESH_INTERVAL     (HZ * 2)

/* protects vclock_hash addition/deletion */
static DEFINE_SPINLOCK(vclock_hash_lock);

static DEFINE_READ_MOSTLY_HASHTABLE(vclock_hash, 8);

static void ptp_vclock_hash_add(struct ptp_vclock *vclock)
{
        spin_lock(&vclock_hash_lock);

        hlist_add_head_rcu(&vclock->vclock_hash_node,
                           &vclock_hash[vclock->clock->index % HASH_SIZE(vclock_hash)]);

        spin_unlock(&vclock_hash_lock);
}

static void ptp_vclock_hash_del(struct ptp_vclock *vclock)
{
        spin_lock(&vclock_hash_lock);

        hlist_del_init_rcu(&vclock->vclock_hash_node);

        spin_unlock(&vclock_hash_lock);

        synchronize_rcu();
}

static int ptp_vclock_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
{
        struct ptp_vclock *vclock = info_to_vclock(ptp);
        s64 adj;

        adj = (s64)scaled_ppm << PTP_VCLOCK_FADJ_SHIFT;
        adj = div_s64(adj, PTP_VCLOCK_FADJ_DENOMINATOR);

        if (mutex_lock_interruptible(&vclock->lock))
                return -EINTR;
        timecounter_read(&vclock->tc);
        vclock->cc.mult = PTP_VCLOCK_CC_MULT + adj;
        mutex_unlock(&vclock->lock);

        return 0;
}

static int ptp_vclock_adjtime(struct ptp_clock_info *ptp, s64 delta)
{
        struct ptp_vclock *vclock = info_to_vclock(ptp);

        if (mutex_lock_interruptible(&vclock->lock))
                return -EINTR;
        timecounter_adjtime(&vclock->tc, delta);
        mutex_unlock(&vclock->lock);

        return 0;
}

static int ptp_vclock_gettime(struct ptp_clock_info *ptp,
                              struct timespec64 *ts)
{
        struct ptp_vclock *vclock = info_to_vclock(ptp);
        u64 ns;

        if (mutex_lock_interruptible(&vclock->lock))
                return -EINTR;
        ns = timecounter_read(&vclock->tc);
        mutex_unlock(&vclock->lock);
        *ts = ns_to_timespec64(ns);

        return 0;
}

static int ptp_vclock_gettimex(struct ptp_clock_info *ptp,
                               struct timespec64 *ts,
                               struct ptp_system_timestamp *sts)
{
        struct ptp_vclock *vclock = info_to_vclock(ptp);
        struct ptp_clock *pptp = vclock->pclock;
        struct timespec64 pts;
        int err;
        u64 ns;

        err = pptp->info->getcyclesx64(pptp->info, &pts, sts);
        if (err)
                return err;

        if (mutex_lock_interruptible(&vclock->lock))
                return -EINTR;
        ns = timecounter_cyc2time(&vclock->tc, timespec64_to_ns(&pts));
        mutex_unlock(&vclock->lock);

        *ts = ns_to_timespec64(ns);

        return 0;
}

static int
ptp_vclock_settime(struct ptp_clock_info *ptp, const struct timespec64 *ts) { struct ptp_vclock *vclock = info_to_vclock(ptp); u64 ns = timespec64_to_ns(ts); if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; timecounter_init(&vclock->tc, &vclock->cc, ns); mutex_unlock(&vclock->lock); return 0; } static int ptp_vclock_getcrosststamp(struct ptp_clock_info *ptp, struct system_device_crosststamp *xtstamp) { struct ptp_vclock *vclock = info_to_vclock(ptp); struct ptp_clock *pptp = vclock->pclock; int err; u64 ns; err = pptp->info->getcrosscycles(pptp->info, xtstamp); if (err) return err; if (mutex_lock_interruptible(&vclock->lock)) return -EINTR; ns = timecounter_cyc2time(&vclock->tc, ktime_to_ns(xtstamp->device)); mutex_unlock(&vclock->lock); xtstamp->device = ns_to_ktime(ns); return 0; } static long ptp_vclock_refresh(struct ptp_clock_info *ptp) { struct ptp_vclock *vclock = info_to_vclock(ptp); struct timespec64 ts; ptp_vclock_gettime(&vclock->info, &ts); return PTP_VCLOCK_REFRESH_INTERVAL; } static const struct ptp_clock_info ptp_vclock_info = { .owner = THIS_MODULE, .name = "ptp virtual clock", .max_adj = 500000000, .adjfine = ptp_vclock_adjfine, .adjtime = ptp_vclock_adjtime, .settime64 = ptp_vclock_settime, .do_aux_work = ptp_vclock_refresh, }; static u64 ptp_vclock_read(const struct cyclecounter *cc) { struct ptp_vclock *vclock = cc_to_vclock(cc); struct ptp_clock *ptp = vclock->pclock; struct timespec64 ts = {}; ptp->info->getcycles64(ptp->info, &ts); return timespec64_to_ns(&ts); } static const struct cyclecounter ptp_vclock_cc = { .read = ptp_vclock_read, .mask = CYCLECOUNTER_MASK(32), .mult = PTP_VCLOCK_CC_MULT, .shift = PTP_VCLOCK_CC_SHIFT, }; struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock) { struct ptp_vclock *vclock; vclock = kzalloc(sizeof(*vclock), GFP_KERNEL); if (!vclock) return NULL; vclock->pclock = pclock; vclock->info = ptp_vclock_info; if (pclock->info->getcyclesx64) vclock->info.gettimex64 = ptp_vclock_gettimex; else vclock->info.gettime64 = ptp_vclock_gettime; if (pclock->info->getcrosscycles) vclock->info.getcrosststamp = ptp_vclock_getcrosststamp; vclock->cc = ptp_vclock_cc; snprintf(vclock->info.name, PTP_CLOCK_NAME_LEN, "ptp%d_virt", pclock->index); INIT_HLIST_NODE(&vclock->vclock_hash_node); mutex_init(&vclock->lock); vclock->clock = ptp_clock_register(&vclock->info, &pclock->dev); if (IS_ERR_OR_NULL(vclock->clock)) { kfree(vclock); return NULL; } timecounter_init(&vclock->tc, &vclock->cc, 0); ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL); ptp_vclock_hash_add(vclock); return vclock; } void ptp_vclock_unregister(struct ptp_vclock *vclock) { ptp_vclock_hash_del(vclock); ptp_clock_unregister(vclock->clock); kfree(vclock); } #if IS_BUILTIN(CONFIG_PTP_1588_CLOCK) int ptp_get_vclocks_index(int pclock_index, int **vclock_index) { char name[PTP_CLOCK_NAME_LEN] = ""; struct ptp_clock *ptp; struct device *dev; int num = 0; if (pclock_index < 0) return num; snprintf(name, PTP_CLOCK_NAME_LEN, "ptp%d", pclock_index); dev = class_find_device_by_name(ptp_class, name); if (!dev) return num; ptp = dev_get_drvdata(dev); if (mutex_lock_interruptible(&ptp->n_vclocks_mux)) { put_device(dev); return num; } *vclock_index = kzalloc(sizeof(int) * ptp->n_vclocks, GFP_KERNEL); if (!(*vclock_index)) goto out; memcpy(*vclock_index, ptp->vclock_index, sizeof(int) * ptp->n_vclocks); num = ptp->n_vclocks; out: mutex_unlock(&ptp->n_vclocks_mux); put_device(dev); return num; } EXPORT_SYMBOL(ptp_get_vclocks_index); ktime_t ptp_convert_timestamp(const 
ktime_t *hwtstamp, int vclock_index) { unsigned int hash = vclock_index % HASH_SIZE(vclock_hash); struct ptp_vclock *vclock; u64 ns; u64 vclock_ns = 0; ns = ktime_to_ns(*hwtstamp); rcu_read_lock(); hlist_for_each_entry_rcu(vclock, &vclock_hash[hash], vclock_hash_node) { if (vclock->clock->index != vclock_index) continue; if (mutex_lock_interruptible(&vclock->lock)) break; vclock_ns = timecounter_cyc2time(&vclock->tc, ns); mutex_unlock(&vclock->lock); break; } rcu_read_unlock(); return ns_to_ktime(vclock_ns); } EXPORT_SYMBOL(ptp_convert_timestamp); #endif
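/*
 * Illustrative aside (not part of the kernel file above): the arithmetic
 * behind ptp_vclock_adjfine(). With PTP_VCLOCK_CC_MULT = 2^31 and a shift
 * of 31, the timecounter nominally converts parent-clock nanoseconds 1:1.
 * scaled_ppm is parts-per-million with a 16-bit binary fraction, so the
 * required offset to cc.mult is
 *
 *     2^31 * scaled_ppm / (2^16 * 10^6) = scaled_ppm * 2^9 / 15625
 *
 * which is exactly (scaled_ppm << PTP_VCLOCK_FADJ_SHIFT) divided by
 * PTP_VCLOCK_FADJ_DENOMINATOR. The check below is a standalone sketch.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        int64_t scaled_ppm = 65536;                 /* +1 ppm in 16.16 fixed point */
        int64_t adj = (scaled_ppm << 9) / 15625;    /* same math as adjfine */

        /* prints ~2147, i.e. 2^31 * 1e-6: the mult tweak for a 1 ppm change */
        printf("mult offset for +1 ppm: %lld\n", (long long)adj);
        return 0;
}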
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Copyright (C) 2016 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#ifndef __XFS_AG_RESV_H__
#define __XFS_AG_RESV_H__

int xfs_ag_resv_free(struct xfs_perag *pag);
int xfs_ag_resv_init(struct xfs_perag *pag, struct xfs_trans *tp);

bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type);
xfs_extlen_t xfs_ag_resv_needed(struct xfs_perag *pag,
                enum xfs_ag_resv_type type);

void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
                struct xfs_alloc_arg *args);
void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type,
                struct xfs_trans *tp, xfs_extlen_t len);

static inline struct xfs_ag_resv *
xfs_perag_resv(
        struct xfs_perag        *pag,
        enum xfs_ag_resv_type   type)
{
        switch (type) {
        case XFS_AG_RESV_METADATA:
                return &pag->pag_meta_resv;
        case XFS_AG_RESV_RMAPBT:
                return &pag->pag_rmapbt_resv;
        default:
                return NULL;
        }
}

/*
 * RMAPBT reservation accounting wrappers. Since rmapbt blocks are sourced from
 * the AGFL, they are allocated one at a time and the reservation updates don't
 * require a transaction.
 */
static inline void
xfs_ag_resv_rmapbt_alloc(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno)
{
        struct xfs_alloc_arg    args = { NULL };
        struct xfs_perag        *pag;

        args.len = 1;
        pag = xfs_perag_get(mp, agno);
        xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args);
        xfs_perag_put(pag);
}

#endif  /* __XFS_AG_RESV_H__ */
// SPDX-License-Identifier: GPL-2.0
#include <linux/cpufreq.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

extern const struct seq_operations cpuinfo_op;

static int cpuinfo_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &cpuinfo_op);
}

static const struct proc_ops cpuinfo_proc_ops = {
        .proc_flags     = PROC_ENTRY_PERMANENT,
        .proc_open      = cpuinfo_open,
        .proc_read_iter = seq_read_iter,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
};

static int __init proc_cpuinfo_init(void)
{
        proc_create("cpuinfo", 0, NULL, &cpuinfo_proc_ops);
        return 0;
}
fs_initcall(proc_cpuinfo_init);
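/*
 * Illustrative aside (not part of the kernel file above): a minimal sketch of
 * the same pattern /proc/cpuinfo uses, i.e. a read-only procfs entry backed by
 * the seq_file machinery, here via the proc_create_single() convenience
 * helper. The entry name "demo" and the output text are made up for the
 * example.
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
        seq_puts(m, "hello from /proc/demo\n");
        return 0;
}

static int __init demo_proc_init(void)
{
        /* proc_create_single() wires up single_open()/seq_read internally */
        proc_create_single("demo", 0, NULL, demo_show);
        return 0;
}

static void __exit demo_proc_exit(void)
{
        remove_proc_entry("demo", NULL);
}

module_init(demo_proc_init);
module_exit(demo_proc_exit);
MODULE_LICENSE("GPL");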
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * CMAC: Cipher Block Mode for Authentication
 *
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * Based on work by:
 *  Copyright © 2013 Tom St Denis <tstdenis@elliptictech.com>
 * Based on crypto/xcbc.c:
 *  Copyright © 2006 USAGI/WIDE Project,
 *  Author: Kazunori Miyazawa <miyazawa@linux-ipv6.org>
 */

#include <crypto/internal/cipher.h>
#include <crypto/internal/hash.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>

/*
 * +------------------------
 * | <parent tfm>
 * +------------------------
 * | cmac_tfm_ctx
 * +------------------------
 * | consts (block size * 2)
 * +------------------------
 */
struct cmac_tfm_ctx {
        struct crypto_cipher *child;
        __be64 consts[];
};

/*
 * +------------------------
 * | <shash desc>
 * +------------------------
 * | cmac_desc_ctx
 * +------------------------
 * | odds (block size)
 * +------------------------
 * | prev (block size)
 * +------------------------
 */
struct cmac_desc_ctx {
        unsigned int len;
        u8 odds[];
};

static int crypto_cmac_digest_setkey(struct crypto_shash *parent,
                                     const u8 *inkey, unsigned int keylen)
{
        struct cmac_tfm_ctx *ctx = crypto_shash_ctx(parent);
        unsigned int bs = crypto_shash_blocksize(parent);
        __be64 *consts = ctx->consts;
        u64 _const[2];
        int i, err = 0;
        u8 msb_mask, gfmask;

        err = crypto_cipher_setkey(ctx->child, inkey, keylen);
        if (err)
                return err;

        /* encrypt the zero block */
        memset(consts, 0, bs);
        crypto_cipher_encrypt_one(ctx->child, (u8 *)consts, (u8 *)consts);

        switch (bs) {
        case 16:
                gfmask = 0x87;
                _const[0] = be64_to_cpu(consts[1]);
                _const[1] = be64_to_cpu(consts[0]);

                /* gf(2^128) multiply zero-ciphertext with u and u^2 */
                for (i = 0; i < 4; i += 2) {
                        msb_mask = ((s64)_const[1] >> 63) & gfmask;
                        _const[1] = (_const[1] << 1) | (_const[0] >> 63);
                        _const[0] = (_const[0] << 1) ^ msb_mask;

                        consts[i + 0] = cpu_to_be64(_const[1]);
                        consts[i + 1] = cpu_to_be64(_const[0]);
                }

                break;
        case 8:
                gfmask = 0x1B;
                _const[0] = be64_to_cpu(consts[0]);

                /* gf(2^64) multiply zero-ciphertext with u and u^2 */
                for (i = 0; i < 2; i++) {
                        msb_mask = ((s64)_const[0] >> 63) & gfmask;
                        _const[0] = (_const[0] << 1) ^ msb_mask;

                        consts[i] = cpu_to_be64(_const[0]);
                }

                break;
        }

        return 0;
}

static int crypto_cmac_digest_init(struct shash_desc *pdesc)
{
        struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
        int bs =
crypto_shash_blocksize(pdesc->tfm); u8 *prev = &ctx->odds[bs]; ctx->len = 0; memset(prev, 0, bs); return 0; } static int crypto_cmac_digest_update(struct shash_desc *pdesc, const u8 *p, unsigned int len) { struct crypto_shash *parent = pdesc->tfm; struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent); struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_blocksize(parent); u8 *odds = ctx->odds; u8 *prev = odds + bs; /* checking the data can fill the block */ if ((ctx->len + len) <= bs) { memcpy(odds + ctx->len, p, len); ctx->len += len; return 0; } /* filling odds with new data and encrypting it */ memcpy(odds + ctx->len, p, bs - ctx->len); len -= bs - ctx->len; p += bs - ctx->len; crypto_xor(prev, odds, bs); crypto_cipher_encrypt_one(tfm, prev, prev); /* clearing the length */ ctx->len = 0; /* encrypting the rest of data */ while (len > bs) { crypto_xor(prev, p, bs); crypto_cipher_encrypt_one(tfm, prev, prev); p += bs; len -= bs; } /* keeping the surplus of blocksize */ if (len) { memcpy(odds, p, len); ctx->len = len; } return 0; } static int crypto_cmac_digest_final(struct shash_desc *pdesc, u8 *out) { struct crypto_shash *parent = pdesc->tfm; struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent); struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_blocksize(parent); u8 *odds = ctx->odds; u8 *prev = odds + bs; unsigned int offset = 0; if (ctx->len != bs) { unsigned int rlen; u8 *p = odds + ctx->len; *p = 0x80; p++; rlen = bs - ctx->len - 1; if (rlen) memset(p, 0, rlen); offset += bs; } crypto_xor(prev, odds, bs); crypto_xor(prev, (const u8 *)tctx->consts + offset, bs); crypto_cipher_encrypt_one(tfm, out, prev); return 0; } static int cmac_init_tfm(struct crypto_shash *tfm) { struct shash_instance *inst = shash_alg_instance(tfm); struct cmac_tfm_ctx *ctx = crypto_shash_ctx(tfm); struct crypto_cipher_spawn *spawn; struct crypto_cipher *cipher; spawn = shash_instance_ctx(inst); cipher = crypto_spawn_cipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; return 0; } static int cmac_clone_tfm(struct crypto_shash *tfm, struct crypto_shash *otfm) { struct cmac_tfm_ctx *octx = crypto_shash_ctx(otfm); struct cmac_tfm_ctx *ctx = crypto_shash_ctx(tfm); struct crypto_cipher *cipher; cipher = crypto_clone_cipher(octx->child); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; return 0; } static void cmac_exit_tfm(struct crypto_shash *tfm) { struct cmac_tfm_ctx *ctx = crypto_shash_ctx(tfm); crypto_free_cipher(ctx->child); } static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb) { struct shash_instance *inst; struct crypto_cipher_spawn *spawn; struct crypto_alg *alg; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = shash_instance_ctx(inst); err = crypto_grab_cipher(spawn, shash_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_cipher_alg(spawn); switch (alg->cra_blocksize) { case 16: case 8: break; default: err = -EINVAL; goto err_free_inst; } err = crypto_inst_setname(shash_crypto_instance(inst), tmpl->name, alg); if (err) goto err_free_inst; inst->alg.base.cra_priority = alg->cra_priority; inst->alg.base.cra_blocksize = alg->cra_blocksize; inst->alg.base.cra_ctxsize = sizeof(struct cmac_tfm_ctx) + alg->cra_blocksize * 2; 
inst->alg.digestsize = alg->cra_blocksize; inst->alg.descsize = sizeof(struct cmac_desc_ctx) + alg->cra_blocksize * 2; inst->alg.init = crypto_cmac_digest_init; inst->alg.update = crypto_cmac_digest_update; inst->alg.final = crypto_cmac_digest_final; inst->alg.setkey = crypto_cmac_digest_setkey; inst->alg.init_tfm = cmac_init_tfm; inst->alg.clone_tfm = cmac_clone_tfm; inst->alg.exit_tfm = cmac_exit_tfm; inst->free = shash_free_singlespawn_instance; err = shash_register_instance(tmpl, inst); if (err) { err_free_inst: shash_free_singlespawn_instance(inst); } return err; } static struct crypto_template crypto_cmac_tmpl = { .name = "cmac", .create = cmac_create, .module = THIS_MODULE, }; static int __init crypto_cmac_module_init(void) { return crypto_register_template(&crypto_cmac_tmpl); } static void __exit crypto_cmac_module_exit(void) { crypto_unregister_template(&crypto_cmac_tmpl); } subsys_initcall(crypto_cmac_module_init); module_exit(crypto_cmac_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("CMAC keyed hash algorithm"); MODULE_ALIAS_CRYPTO("cmac"); MODULE_IMPORT_NS(CRYPTO_INTERNAL);
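/*
 * Illustrative aside (not part of the kernel file above): a hedged userspace
 * sketch that exercises the "cmac" template registered above through the
 * AF_ALG socket interface. The 16-byte all-zero key and the message text are
 * arbitrary example values, and error handling is kept minimal.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_alg.h>

int main(void)
{
        struct sockaddr_alg sa = {
                .salg_family = AF_ALG,
                .salg_type   = "hash",
                .salg_name   = "cmac(aes)",
        };
        unsigned char key[16] = { 0 };          /* example 128-bit AES key */
        unsigned char mac[16];
        const char *msg = "example message";
        int tfmfd, opfd, i;

        tfmfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
        if (tfmfd < 0 || bind(tfmfd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
                return 1;
        if (setsockopt(tfmfd, SOL_ALG, ALG_SET_KEY, key, sizeof(key)) < 0)
                return 1;

        opfd = accept(tfmfd, NULL, 0);
        if (opfd < 0)
                return 1;

        /* one-shot hash: send the whole message, then read the 16-byte CMAC */
        if (send(opfd, msg, strlen(msg), 0) < 0 ||
            read(opfd, mac, sizeof(mac)) != sizeof(mac))
                return 1;

        for (i = 0; i < 16; i++)
                printf("%02x", mac[i]);
        printf("\n");

        close(opfd);
        close(tfmfd);
        return 0;
}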
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level.
* * Support for INET connection oriented protocols. * * Authors: See the TCP sources */ #include <linux/module.h> #include <linux/jhash.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/ip.h> #include <net/route.h> #include <net/tcp_states.h> #include <net/xfrm.h> #include <net/tcp.h> #include <net/sock_reuseport.h> #include <net/addrconf.h> #if IS_ENABLED(CONFIG_IPV6) /* match_sk*_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses * if IPv6 only, and any IPv4 addresses * if not IPv6 only * match_sk*_wildcard == false: addresses must be exactly the same, i.e. * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY, * and 0.0.0.0 equals to 0.0.0.0 only */ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6, const struct in6_addr *sk2_rcv_saddr6, __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, bool sk1_ipv6only, bool sk2_ipv6only, bool match_sk1_wildcard, bool match_sk2_wildcard) { int addr_type = ipv6_addr_type(sk1_rcv_saddr6); int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; /* if both are mapped, treat as IPv4 */ if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) return true; return (match_sk1_wildcard && !sk1_rcv_saddr) || (match_sk2_wildcard && !sk2_rcv_saddr); } return false; } if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY) return true; if (addr_type2 == IPV6_ADDR_ANY && match_sk2_wildcard && !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) return true; if (addr_type == IPV6_ADDR_ANY && match_sk1_wildcard && !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) return true; if (sk2_rcv_saddr6 && ipv6_addr_equal(sk1_rcv_saddr6, sk2_rcv_saddr6)) return true; return false; } #endif /* match_sk*_wildcard == true: 0.0.0.0 equals to any IPv4 addresses * match_sk*_wildcard == false: addresses must be exactly the same, i.e. * 0.0.0.0 only equals to 0.0.0.0 */ static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr, bool sk2_ipv6only, bool match_sk1_wildcard, bool match_sk2_wildcard) { if (!sk2_ipv6only) { if (sk1_rcv_saddr == sk2_rcv_saddr) return true; return (match_sk1_wildcard && !sk1_rcv_saddr) || (match_sk2_wildcard && !sk2_rcv_saddr); } return false; } bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return ipv6_rcv_saddr_equal(&sk->sk_v6_rcv_saddr, inet6_rcv_saddr(sk2), sk->sk_rcv_saddr, sk2->sk_rcv_saddr, ipv6_only_sock(sk), ipv6_only_sock(sk2), match_wildcard, match_wildcard); #endif return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr, ipv6_only_sock(sk2), match_wildcard, match_wildcard); } EXPORT_SYMBOL(inet_rcv_saddr_equal); bool inet_rcv_saddr_any(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return ipv6_addr_any(&sk->sk_v6_rcv_saddr); #endif return !sk->sk_rcv_saddr; } /** * inet_sk_get_local_port_range - fetch ephemeral ports range * @sk: socket * @low: pointer to low port * @high: pointer to high port * * Fetch netns port range (/proc/sys/net/ipv4/ip_local_port_range) * Range can be overridden if socket got IP_LOCAL_PORT_RANGE option. * Returns true if IP_LOCAL_PORT_RANGE was set on this socket. 
*/ bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high) { int lo, hi, sk_lo, sk_hi; bool local_range = false; u32 sk_range; inet_get_local_port_range(sock_net(sk), &lo, &hi); sk_range = READ_ONCE(inet_sk(sk)->local_port_range); if (unlikely(sk_range)) { sk_lo = sk_range & 0xffff; sk_hi = sk_range >> 16; if (lo <= sk_lo && sk_lo <= hi) lo = sk_lo; if (lo <= sk_hi && sk_hi <= hi) hi = sk_hi; local_range = true; } *low = lo; *high = hi; return local_range; } EXPORT_SYMBOL(inet_sk_get_local_port_range); static bool inet_use_bhash2_on_bind(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); if (addr_type == IPV6_ADDR_ANY) return false; if (addr_type != IPV6_ADDR_MAPPED) return true; } #endif return sk->sk_rcv_saddr != htonl(INADDR_ANY); } static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { int bound_dev_if2; if (sk == sk2) return false; bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if); if (!sk->sk_bound_dev_if || !bound_dev_if2 || sk->sk_bound_dev_if == bound_dev_if2) { if (sk->sk_reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN) { if (!relax || (!reuseport_ok && sk->sk_reuseport && sk2->sk_reuseport && reuseport_cb_ok && (sk2->sk_state == TCP_TIME_WAIT || uid_eq(sk_uid, sock_i_uid(sk2))))) return true; } else if (!reuseport_ok || !sk->sk_reuseport || !sk2->sk_reuseport || !reuseport_cb_ok || (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(sk_uid, sock_i_uid(sk2)))) { return true; } } return false; } static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { if (sk->sk_family == AF_INET && ipv6_only_sock(sk2)) return false; return inet_bind_conflict(sk, sk2, sk_uid, relax, reuseport_cb_ok, reuseport_ok); } static bool inet_bhash2_conflict(const struct sock *sk, const struct inet_bind2_bucket *tb2, kuid_t sk_uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { struct sock *sk2; sk_for_each_bound(sk2, &tb2->owners) { if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, reuseport_cb_ok, reuseport_ok)) return true; } return false; } #define sk_for_each_bound_bhash(__sk, __tb2, __tb) \ hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node) \ sk_for_each_bound(sk2, &(__tb2)->owners) /* This should be called only when the tb and tb2 hashbuckets' locks are held */ static int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, const struct inet_bind2_bucket *tb2, /* may be null */ bool relax, bool reuseport_ok) { kuid_t uid = sock_i_uid((struct sock *)sk); struct sock_reuseport *reuseport_cb; bool reuseport_cb_ok; struct sock *sk2; rcu_read_lock(); reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */ reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); rcu_read_unlock(); /* Conflicts with an existing IPV6_ADDR_ANY (if ipv6) or INADDR_ANY (if * ipv4) should have been checked already. 
We need to do these two * checks separately because their spinlocks have to be acquired/released * independently of each other, to prevent possible deadlocks */ if (inet_use_bhash2_on_bind(sk)) return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok); /* Unlike other sk lookup places we do not check * for sk_net here, since _all_ the socks listed * in tb->owners and tb2->owners list belong * to the same net - the one this bucket belongs to. */ sk_for_each_bound_bhash(sk2, tb2, tb) { if (!inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok)) continue; if (inet_rcv_saddr_equal(sk, sk2, true)) return true; } return false; } /* Determine if there is a bind conflict with an existing IPV6_ADDR_ANY (if ipv6) or * INADDR_ANY (if ipv4) socket. * * Caller must hold bhash hashbucket lock with local bh disabled, to protect * against concurrent binds on the port for addr any */ static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l3mdev, bool relax, bool reuseport_ok) { kuid_t uid = sock_i_uid((struct sock *)sk); const struct net *net = sock_net(sk); struct sock_reuseport *reuseport_cb; struct inet_bind_hashbucket *head2; struct inet_bind2_bucket *tb2; bool reuseport_cb_ok; rcu_read_lock(); reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */ reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); rcu_read_unlock(); head2 = inet_bhash2_addr_any_hashbucket(sk, net, port); spin_lock(&head2->lock); inet_bind_bucket_for_each(tb2, &head2->chain) if (inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk)) break; if (tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok)) { spin_unlock(&head2->lock); return true; } spin_unlock(&head2->lock); return false; } /* * Find an open port number for the socket. Returns with the * inet_bind_hashbucket locks held if successful. */ static struct inet_bind_hashbucket * inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret, struct inet_bind2_bucket **tb2_ret, struct inet_bind_hashbucket **head2_ret, int *port_ret) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); int i, low, high, attempt_half, port, l3mdev; struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; u32 remaining, offset; bool relax = false; l3mdev = inet_sk_bound_l3mdev(sk); ports_exhausted: attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; other_half_scan: inet_sk_get_local_port_range(sk, &low, &high); high++; /* [32768, 60999] -> [32768, 61000[ */ if (high - low < 4) attempt_half = 0; if (attempt_half) { int half = low + (((high - low) >> 2) << 1); if (attempt_half == 1) high = half; else low = half; } remaining = high - low; if (likely(remaining > 1)) remaining &= ~1U; offset = get_random_u32_below(remaining); /* __inet_hash_connect() favors ports having @low parity * We do the opposite to not pollute connect() users. 
*/ offset |= 1U; other_parity_scan: port = low + offset; for (i = 0; i < remaining; i += 2, port += 2) { if (unlikely(port >= high)) port -= remaining; if (inet_is_local_reserved_port(net, port)) continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); if (inet_use_bhash2_on_bind(sk)) { if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, relax, false)) goto next_port; } head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); inet_bind_bucket_for_each(tb, &head->chain) if (inet_bind_bucket_match(tb, net, port, l3mdev)) { if (!inet_csk_bind_conflict(sk, tb, tb2, relax, false)) goto success; spin_unlock(&head2->lock); goto next_port; } tb = NULL; goto success; next_port: spin_unlock_bh(&head->lock); cond_resched(); } offset--; if (!(offset & 1)) goto other_parity_scan; if (attempt_half == 1) { /* OK we now try the upper half of the range */ attempt_half = 2; goto other_half_scan; } if (READ_ONCE(net->ipv4.sysctl_ip_autobind_reuse) && !relax) { /* We still have a chance to connect to different destinations */ relax = true; goto ports_exhausted; } return NULL; success: *port_ret = port; *tb_ret = tb; *tb2_ret = tb2; *head2_ret = head2; return head; } static inline int sk_reuseport_match(struct inet_bind_bucket *tb, struct sock *sk) { kuid_t uid = sock_i_uid(sk); if (tb->fastreuseport <= 0) return 0; if (!sk->sk_reuseport) return 0; if (rcu_access_pointer(sk->sk_reuseport_cb)) return 0; if (!uid_eq(tb->fastuid, uid)) return 0; /* We only need to check the rcv_saddr if this tb was once marked * without fastreuseport and then was reset, as we can only know that * the fast_*rcv_saddr doesn't have any conflicts with the socks on the * owners list. */ if (tb->fastreuseport == FASTREUSEPORT_ANY) return 1; #if IS_ENABLED(CONFIG_IPV6) if (tb->fast_sk_family == AF_INET6) return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr, inet6_rcv_saddr(sk), tb->fast_rcv_saddr, sk->sk_rcv_saddr, tb->fast_ipv6_only, ipv6_only_sock(sk), true, false); #endif return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr, ipv6_only_sock(sk), true, false); } void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, struct sock *sk) { kuid_t uid = sock_i_uid(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; if (hlist_empty(&tb->bhash2)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = FASTREUSEPORT_ANY; tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif } else { tb->fastreuseport = 0; } } else { if (!reuse) tb->fastreuse = 0; if (sk->sk_reuseport) { /* We didn't match or we don't have fastreuseport set on * the tb, but we have sk_reuseport set on this socket * and we know that there are no bind conflicts with * this socket in this tb, so reset our tb's reuseport * settings so that any subsequent sockets that match * our current socket will be put on the fast path. * * If we reset we need to set FASTREUSEPORT_STRICT so we * do extra checking for all subsequent sk_reuseport * socks. 
*/ if (!sk_reuseport_match(tb, sk)) { tb->fastreuseport = FASTREUSEPORT_STRICT; tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif } } else { tb->fastreuseport = 0; } } } /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. * We try to allocate an odd port (and leave even ports for connect()) */ int inet_csk_get_port(struct sock *sk, unsigned short snum) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; bool found_port = false, check_bind_conflict = true; bool bhash_created = false, bhash2_created = false; int ret = -EADDRINUSE, port = snum, l3mdev; struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2 = NULL; struct inet_bind_bucket *tb = NULL; bool head2_lock_acquired = false; struct net *net = sock_net(sk); l3mdev = inet_sk_bound_l3mdev(sk); if (!port) { head = inet_csk_find_open_port(sk, &tb, &tb2, &head2, &port); if (!head) return ret; head2_lock_acquired = true; if (tb && tb2) goto success; found_port = true; } else { head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; spin_lock_bh(&head->lock); inet_bind_bucket_for_each(tb, &head->chain) if (inet_bind_bucket_match(tb, net, port, l3mdev)) break; } if (!tb) { tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port, l3mdev); if (!tb) goto fail_unlock; bhash_created = true; } if (!found_port) { if (!hlist_empty(&tb->bhash2)) { if (sk->sk_reuse == SK_FORCE_REUSE || (tb->fastreuse > 0 && reuse) || sk_reuseport_match(tb, sk)) check_bind_conflict = false; } if (check_bind_conflict && inet_use_bhash2_on_bind(sk)) { if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, true, true)) goto fail_unlock; } head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); spin_lock(&head2->lock); head2_lock_acquired = true; tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); } if (!tb2) { tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, head2, tb, sk); if (!tb2) goto fail_unlock; bhash2_created = true; } if (!found_port && check_bind_conflict) { if (inet_csk_bind_conflict(sk, tb, tb2, true, true)) goto fail_unlock; } success: inet_csk_update_fastreuse(tb, sk); if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, tb2, port); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2); ret = 0; fail_unlock: if (ret) { if (bhash2_created) inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2); if (bhash_created) inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); } if (head2_lock_acquired) spin_unlock(&head2->lock); spin_unlock_bh(&head->lock); return ret; } EXPORT_SYMBOL_GPL(inet_csk_get_port); /* * Wait for an incoming connection, avoid race conditions. This must be called * with the socket locked. */ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) { struct inet_connection_sock *icsk = inet_csk(sk); DEFINE_WAIT(wait); int err; /* * True wake-one mechanism for incoming connections: only * one process gets woken up, not the 'whole herd'. * Since we do not 'race & poll' for established sockets * anymore, the common case will execute the loop only once. 
* * Subtle issue: "add_wait_queue_exclusive()" will be added * after any current non-exclusive waiters, and we know that * it will always _stay_ after any new non-exclusive waiters * because all non-exclusive waiters are added at the * beginning of the wait-queue. As such, it's ok to "drop" * our exclusiveness temporarily when we get woken up without * having to remove and re-insert us on the wait queue. */ for (;;) { prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); release_sock(sk); if (reqsk_queue_empty(&icsk->icsk_accept_queue)) timeo = schedule_timeout(timeo); sched_annotate_sleep(); lock_sock(sk); err = 0; if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) break; err = -EINVAL; if (sk->sk_state != TCP_LISTEN) break; err = sock_intr_errno(timeo); if (signal_pending(current)) break; err = -EAGAIN; if (!timeo) break; } finish_wait(sk_sleep(sk), &wait); return err; } /* * This will accept the next outstanding connection. */ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct request_sock *req; struct sock *newsk; int error; lock_sock(sk); /* We need to make sure that this socket is listening, * and that it has something pending. */ error = -EINVAL; if (sk->sk_state != TCP_LISTEN) goto out_err; /* Find already established connection */ if (reqsk_queue_empty(queue)) { long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); /* If this is a non blocking socket don't sleep */ error = -EAGAIN; if (!timeo) goto out_err; error = inet_csk_wait_for_connect(sk, timeo); if (error) goto out_err; } req = reqsk_queue_remove(queue, sk); newsk = req->sk; if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { spin_lock_bh(&queue->fastopenq.lock); if (tcp_rsk(req)->tfo_listener) { /* We are still waiting for the final ACK from 3WHS * so can't free req now. Instead, we set req->sk to * NULL to signify that the child socket is taken * so reqsk_fastopen_remove() will free the req * when 3WHS finishes (or is aborted). */ req->sk = NULL; req = NULL; } spin_unlock_bh(&queue->fastopenq.lock); } out: release_sock(sk); if (newsk && mem_cgroup_sockets_enabled) { int amt = 0; /* atomically get the memory usage, set and charge the * newsk->sk_memcg. */ lock_sock(newsk); mem_cgroup_sk_alloc(newsk); if (newsk->sk_memcg) { /* The socket has not been accepted yet, no need * to look at newsk->sk_wmem_queued. */ amt = sk_mem_pages(newsk->sk_forward_alloc + atomic_read(&newsk->sk_rmem_alloc)); } if (amt) mem_cgroup_charge_skmem(newsk->sk_memcg, amt, GFP_KERNEL | __GFP_NOFAIL); release_sock(newsk); } if (req) reqsk_put(req); if (newsk) inet_init_csk_locks(newsk); return newsk; out_err: newsk = NULL; req = NULL; *err = error; goto out; } EXPORT_SYMBOL(inet_csk_accept); /* * Using different timers for retransmit, delayed acks and probes * We may wish use just one timer maintaining a list of expire jiffies * to optimize. 
*/ void inet_csk_init_xmit_timers(struct sock *sk, void (*retransmit_handler)(struct timer_list *t), void (*delack_handler)(struct timer_list *t), void (*keepalive_handler)(struct timer_list *t)) { struct inet_connection_sock *icsk = inet_csk(sk); timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0); timer_setup(&icsk->icsk_delack_timer, delack_handler, 0); timer_setup(&sk->sk_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } EXPORT_SYMBOL(inet_csk_init_xmit_timers); void inet_csk_clear_xmit_timers(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_pending = icsk->icsk_ack.pending = 0; sk_stop_timer(sk, &icsk->icsk_retransmit_timer); sk_stop_timer(sk, &icsk->icsk_delack_timer); sk_stop_timer(sk, &sk->sk_timer); } EXPORT_SYMBOL(inet_csk_clear_xmit_timers); void inet_csk_delete_keepalive_timer(struct sock *sk) { sk_stop_timer(sk, &sk->sk_timer); } EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) { sk_reset_timer(sk, &sk->sk_timer, jiffies + len); } EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4, const struct request_sock *req) { const struct inet_request_sock *ireq = inet_rsk(req); struct net *net = read_pnet(&ireq->ireq_net); struct ip_options_rcu *opt; struct rtable *rt; rcu_read_lock(); opt = rcu_dereference(ireq->ireq_opt); flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; rcu_read_unlock(); return &rt->dst; route_err: ip_rt_put(rt); no_route: rcu_read_unlock(); __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } EXPORT_SYMBOL_GPL(inet_csk_route_req); struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *newsk, const struct request_sock *req) { const struct inet_request_sock *ireq = inet_rsk(req); struct net *net = read_pnet(&ireq->ireq_net); struct inet_sock *newinet = inet_sk(newsk); struct ip_options_rcu *opt; struct flowi4 *fl4; struct rtable *rt; opt = rcu_dereference(ireq->ireq_opt); fl4 = &newinet->cork.fl.u.ip4; flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? 
opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk->sk_uid); security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; return &rt->dst; route_err: ip_rt_put(rt); no_route: __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); /* Decide when to expire the request and when to resend SYN-ACK */ static void syn_ack_recalc(struct request_sock *req, const int max_syn_ack_retries, const u8 rskq_defer_accept, int *expire, int *resend) { if (!rskq_defer_accept) { *expire = req->num_timeout >= max_syn_ack_retries; *resend = 1; return; } *expire = req->num_timeout >= max_syn_ack_retries && (!inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept); /* Do not resend while waiting for data after ACK, * start to resend on end of deferring period to give * last chance for data or ACK to create established socket. */ *resend = !inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept - 1; } int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) { int err = req->rsk_ops->rtx_syn_ack(parent, req); if (!err) req->num_retrans++; return err; } EXPORT_SYMBOL(inet_rtx_syn_ack); static struct request_sock *inet_reqsk_clone(struct request_sock *req, struct sock *sk) { struct sock *req_sk, *nreq_sk; struct request_sock *nreq; nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN); if (!nreq) { __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */ sock_put(sk); return NULL; } req_sk = req_to_sk(req); nreq_sk = req_to_sk(nreq); memcpy(nreq_sk, req_sk, offsetof(struct sock, sk_dontcopy_begin)); memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end, req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end)); sk_node_init(&nreq_sk->sk_node); nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping; #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping; #endif nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu; nreq->rsk_listener = sk; /* We need not acquire fastopenq->lock * because the child socket is locked in inet_csk_listen_stop(). 
*/ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener) rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq); return nreq; } static void reqsk_queue_migrated(struct request_sock_queue *queue, const struct request_sock *req) { if (req->num_timeout == 0) atomic_inc(&queue->young); atomic_inc(&queue->qlen); } static void reqsk_migrate_reset(struct request_sock *req) { req->saved_syn = NULL; #if IS_ENABLED(CONFIG_IPV6) inet_rsk(req)->ipv6_opt = NULL; inet_rsk(req)->pktopts = NULL; #else inet_rsk(req)->ireq_opt = NULL; #endif } /* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock *req) { struct sock *sk = req_to_sk(req); bool found = false; if (sk_hashed(sk)) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash); spin_lock(lock); found = __sk_nulls_del_node_init_rcu(sk); spin_unlock(lock); } if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) reqsk_put(req); return found; } bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) { bool unlinked = reqsk_queue_unlink(req); if (unlinked) { reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); reqsk_put(req); } return unlinked; } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) { inet_csk_reqsk_queue_drop(sk, req); reqsk_put(req); } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); static void reqsk_timer_handler(struct timer_list *t) { struct request_sock *req = from_timer(req, t, rsk_timer); struct request_sock *nreq = NULL, *oreq = req; struct sock *sk_listener = req->rsk_listener; struct inet_connection_sock *icsk; struct request_sock_queue *queue; struct net *net; int max_syn_ack_retries, qlen, expire = 0, resend = 0; if (inet_sk_state_load(sk_listener) != TCP_LISTEN) { struct sock *nsk; nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL); if (!nsk) goto drop; nreq = inet_reqsk_clone(req, nsk); if (!nreq) goto drop; /* The new timer for the cloned req can decrease the 2 * by calling inet_csk_reqsk_queue_drop_and_put(), so * hold another count to prevent use-after-free and * call reqsk_put() just before return. */ refcount_set(&nreq->rsk_refcnt, 2 + 1); timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED); reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req); req = nreq; sk_listener = nsk; } icsk = inet_csk(sk_listener); net = sock_net(sk_listener); max_syn_ack_retries = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(net->ipv4.sysctl_tcp_synack_retries); /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. * If synack was not acknowledged for 1 second, it means * one of the following things: synack was lost, ack was lost, * rtt is high or nobody planned to ack (i.e. synflood). * When server is a bit loaded, queue is populated with old * open requests, reducing effective size of queue. * When server is well loaded, queue size reduces to zero * after several minutes of work. It is not synflood, * it is normal operation. The solution is pruning * too old entries overriding normal timeout, when * situation becomes dangerous. * * Essentially, we reserve half of room for young * embrions; and abort old ones without pity, if old * ones are about to clog our table. 
*/ queue = &icsk->icsk_accept_queue; qlen = reqsk_queue_len(queue); if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) { int young = reqsk_queue_len_young(queue) << 1; while (max_syn_ack_retries > 2) { if (qlen < young) break; max_syn_ack_retries--; young <<= 1; } } syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept), &expire, &resend); req->rsk_ops->syn_ack_timeout(req); if (!expire && (!resend || !inet_rtx_syn_ack(sk_listener, req) || inet_rsk(req)->acked)) { if (req->num_timeout++ == 0) atomic_dec(&queue->young); mod_timer(&req->rsk_timer, jiffies + reqsk_timeout(req, TCP_RTO_MAX)); if (!nreq) return; if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { /* delete timer */ inet_csk_reqsk_queue_drop(sk_listener, nreq); goto no_ownership; } __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(oreq); reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq); reqsk_put(oreq); reqsk_put(nreq); return; } /* Even if we can clone the req, we may need not retransmit any more * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc), or another * CPU may win the "own_req" race so that inet_ehash_insert() fails. */ if (nreq) { __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQFAILURE); no_ownership: reqsk_migrate_reset(nreq); reqsk_queue_removed(queue, nreq); __reqsk_free(nreq); } drop: inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq); } static void reqsk_queue_hash_req(struct request_sock *req, unsigned long timeout) { timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); inet_ehash_insert(req_to_sk(req), NULL, NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ smp_wmb(); refcount_set(&req->rsk_refcnt, 2 + 1); } void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, unsigned long timeout) { reqsk_queue_hash_req(req, timeout); inet_csk_reqsk_queue_added(sk); } EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk, const gfp_t priority) { struct inet_connection_sock *icsk = inet_csk(newsk); if (!icsk->icsk_ulp_ops) return; icsk->icsk_ulp_ops->clone(req, newsk, priority); } /** * inet_csk_clone_lock - clone an inet socket, and lock its clone * @sk: the socket to clone * @req: request_sock * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) */ struct sock *inet_csk_clone_lock(const struct sock *sk, const struct request_sock *req, const gfp_t priority) { struct sock *newsk = sk_clone_lock(sk, priority); if (newsk) { struct inet_connection_sock *newicsk = inet_csk(newsk); inet_sk_set_state(newsk, TCP_SYN_RECV); newicsk->icsk_bind_hash = NULL; newicsk->icsk_bind2_hash = NULL; inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num; inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); /* listeners have SOCK_RCU_FREE, not the children */ sock_reset_flag(newsk, SOCK_RCU_FREE); inet_sk(newsk)->mc_list = NULL; newsk->sk_mark = inet_rsk(req)->ir_mark; atomic64_set(&newsk->sk_cookie, atomic64_read(&inet_rsk(req)->ir_cookie)); newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; newicsk->icsk_probes_out = 0; newicsk->icsk_probes_tstamp = 0; /* Deinitialize accept_queue to trap illegal accesses. 
*/ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); inet_clone_ulp(req, newsk, priority); security_inet_csk_clone(newsk, req); } return newsk; } EXPORT_SYMBOL_GPL(inet_csk_clone_lock); /* * At this point, there should be no process reference to this * socket, and thus no user references at all. Therefore we * can assume the socket waitqueue is inactive and nobody will * try to jump onto it. */ void inet_csk_destroy_sock(struct sock *sk) { WARN_ON(sk->sk_state != TCP_CLOSE); WARN_ON(!sock_flag(sk, SOCK_DEAD)); /* It cannot be in hash table! */ WARN_ON(!sk_unhashed(sk)); /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */ WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash); sk->sk_prot->destroy(sk); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); this_cpu_dec(*sk->sk_prot->orphan_count); sock_put(sk); } EXPORT_SYMBOL(inet_csk_destroy_sock); /* This function allows to force a closure of a socket after the call to * tcp/dccp_create_openreq_child(). */ void inet_csk_prepare_forced_close(struct sock *sk) __releases(&sk->sk_lock.slock) { /* sk_clone_lock locked the socket and set refcnt to 2 */ bh_unlock_sock(sk); sock_put(sk); inet_csk_prepare_for_destroy_sock(sk); inet_sk(sk)->inet_num = 0; } EXPORT_SYMBOL(inet_csk_prepare_forced_close); static int inet_ulp_can_listen(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ulp_ops && !icsk->icsk_ulp_ops->clone) return -EINVAL; return 0; } int inet_csk_listen_start(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); int err; err = inet_ulp_can_listen(sk); if (unlikely(err)) return err; reqsk_queue_alloc(&icsk->icsk_accept_queue); sk->sk_ack_backlog = 0; inet_csk_delack_init(sk); /* There is race window here: we announce ourselves listening, * but this transition is still not validated by get_port(). * It is OK, because this socket enters to hash table only * after validation is complete. */ inet_sk_state_store(sk, TCP_LISTEN); err = sk->sk_prot->get_port(sk, inet->inet_num); if (!err) { inet->inet_sport = htons(inet->inet_num); sk_dst_reset(sk); err = sk->sk_prot->hash(sk); if (likely(!err)) return 0; } inet_sk_set_state(sk, TCP_CLOSE); return err; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); static void inet_child_forget(struct sock *sk, struct request_sock *req, struct sock *child) { sk->sk_prot->disconnect(child, O_NONBLOCK); sock_orphan(child); this_cpu_inc(*sk->sk_prot->orphan_count); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req); BUG_ON(sk != req->rsk_listener); /* Paranoid, to prevent race condition if * an inbound pkt destined for child is * blocked by sock lock in tcp_v4_rcv(). * Also to satisfy an assertion in * tcp_v4_destroy_sock(). 
*/ RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL); } inet_csk_destroy_sock(child); } struct sock *inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, struct sock *child) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; spin_lock(&queue->rskq_lock); if (unlikely(sk->sk_state != TCP_LISTEN)) { inet_child_forget(sk, req, child); child = NULL; } else { req->sk = child; req->dl_next = NULL; if (queue->rskq_accept_head == NULL) WRITE_ONCE(queue->rskq_accept_head, req); else queue->rskq_accept_tail->dl_next = req; queue->rskq_accept_tail = req; sk_acceptq_added(sk); } spin_unlock(&queue->rskq_lock); return child; } EXPORT_SYMBOL(inet_csk_reqsk_queue_add); struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, bool own_req) { if (own_req) { inet_csk_reqsk_queue_drop(req->rsk_listener, req); reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); if (sk != req->rsk_listener) { /* another listening sk has been selected, * migrate the req to it. */ struct request_sock *nreq; /* hold a refcnt for the nreq->rsk_listener * which is assigned in inet_reqsk_clone() */ sock_hold(sk); nreq = inet_reqsk_clone(req, sk); if (!nreq) { inet_child_forget(sk, req, child); goto child_put; } refcount_set(&nreq->rsk_refcnt, 1); if (inet_csk_reqsk_queue_add(sk, nreq, child)) { __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(req); reqsk_put(req); return child; } __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); reqsk_migrate_reset(nreq); __reqsk_free(nreq); } else if (inet_csk_reqsk_queue_add(sk, req, child)) { return child; } } /* Too bad, another child took ownership of the request, undo. */ child_put: bh_unlock_sock(child); sock_put(child); return NULL; } EXPORT_SYMBOL(inet_csk_complete_hashdance); /* * This routine closes sockets which have been at least partially * opened, but not yet accepted. */ void inet_csk_listen_stop(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct request_sock *next, *req; /* Following specs, it would be better either to send FIN * (and enter FIN-WAIT-1, it is normal close) * or to send active reset (abort). * Certainly, it is pretty dangerous while synflood, but it is * bad justification for our negligence 8) * To be honest, we are not able to make either * of the variants now. --ANK */ while ((req = reqsk_queue_remove(queue, sk)) != NULL) { struct sock *child = req->sk, *nsk; struct request_sock *nreq; local_bh_disable(); bh_lock_sock(child); WARN_ON(sock_owned_by_user(child)); sock_hold(child); nsk = reuseport_migrate_sock(sk, child, NULL); if (nsk) { nreq = inet_reqsk_clone(req, nsk); if (nreq) { refcount_set(&nreq->rsk_refcnt, 1); if (inet_csk_reqsk_queue_add(nsk, nreq, child)) { __NET_INC_STATS(sock_net(nsk), LINUX_MIB_TCPMIGRATEREQSUCCESS); reqsk_migrate_reset(req); } else { __NET_INC_STATS(sock_net(nsk), LINUX_MIB_TCPMIGRATEREQFAILURE); reqsk_migrate_reset(nreq); __reqsk_free(nreq); } /* inet_csk_reqsk_queue_add() has already * called inet_child_forget() on failure case. */ goto skip_child_forget; } } inet_child_forget(sk, req, child); skip_child_forget: reqsk_put(req); bh_unlock_sock(child); local_bh_enable(); sock_put(child); cond_resched(); } if (queue->fastopenq.rskq_rst_head) { /* Free all the reqs queued in rskq_rst_head. 
*/ spin_lock_bh(&queue->fastopenq.lock); req = queue->fastopenq.rskq_rst_head; queue->fastopenq.rskq_rst_head = NULL; spin_unlock_bh(&queue->fastopenq.lock); while (req != NULL) { next = req->dl_next; reqsk_put(req); req = next; } } WARN_ON_ONCE(sk->sk_ack_backlog); } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) { struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; const struct inet_sock *inet = inet_sk(sk); sin->sin_family = AF_INET; sin->sin_addr.s_addr = inet->inet_daddr; sin->sin_port = inet->inet_dport; } EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) { const struct inet_sock *inet = inet_sk(sk); const struct ip_options_rcu *inet_opt; __be32 daddr = inet->inet_daddr; struct flowi4 *fl4; struct rtable *rt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; fl4 = &fl->u.ip4; rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); if (IS_ERR(rt)) rt = NULL; if (rt) sk_setup_caps(sk, &rt->dst); rcu_read_unlock(); return &rt->dst; } struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) { struct dst_entry *dst = __sk_dst_check(sk, 0); struct inet_sock *inet = inet_sk(sk); if (!dst) { dst = inet_csk_rebuild_route(sk, &inet->cork.fl); if (!dst) goto out; } dst->ops->update_pmtu(dst, sk, NULL, mtu, true); dst = __sk_dst_check(sk, 0); if (!dst) dst = inet_csk_rebuild_route(sk, &inet->cork.fl); out: return dst; } EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
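/*
 * Userspace-side sketch (not part of this file): inet_sk_get_local_port_range()
 * above unpacks a per-socket override as low = range & 0xffff and
 * high = range >> 16. A caller could install such an override with the
 * IP_LOCAL_PORT_RANGE socket option before bind()/connect(). The fallback
 * definition of IP_LOCAL_PORT_RANGE below is an assumption for older uapi
 * headers; verify it against <linux/in.h> on the target system.
 */
#include <netinet/in.h>
#include <sys/socket.h>
#include <stdint.h>
#include <stdio.h>

#ifndef IP_LOCAL_PORT_RANGE
#define IP_LOCAL_PORT_RANGE 51	/* assumed value, see include/uapi/linux/in.h */
#endif

static int set_local_port_range(int fd, uint16_t low, uint16_t high)
{
	/* low in bits 0..15, high in bits 16..31, mirroring the kernel decode */
	uint32_t range = (uint32_t)low | ((uint32_t)high << 16);

	if (setsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE,
		       &range, sizeof(range)) < 0) {
		perror("setsockopt(IP_LOCAL_PORT_RANGE)");
		return -1;
	}
	return 0;
}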
// SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2010-2011 EIA Electronics, // Pieter Beyens <pieter.beyens@eia.be> // Copyright (c) 2010-2011 EIA Electronics, // Kurt Van Dijck <kurt.van.dijck@eia.be> // Copyright (c) 2018 Protonic, // Robin van der Gracht <robin@protonic.nl> // Copyright (c) 2017-2019 Pengutronix, // Marc Kleine-Budde <kernel@pengutronix.de> // Copyright (c) 2017-2019 Pengutronix, // Oleksij Rempel <kernel@pengutronix.de> /* Core of can-j1939 that links j1939 to CAN. */ #include <linux/can/can-ml.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/if_arp.h> #include <linux/module.h> #include "j1939-priv.h" MODULE_DESCRIPTION("PF_CAN SAE J1939"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("EIA Electronics (Kurt Van Dijck & Pieter Beyens)"); MODULE_ALIAS("can-proto-" __stringify(CAN_J1939)); /* LOWLEVEL CAN interface */ /* CAN_HDR: #bytes before can_frame data part */ #define J1939_CAN_HDR (offsetof(struct can_frame, data)) /* CAN_FTR: #bytes beyond data part */ #define J1939_CAN_FTR (sizeof(struct can_frame) - J1939_CAN_HDR - \ sizeof(((struct can_frame *)0)->data)) /* lowest layer */ static void j1939_can_recv(struct sk_buff *iskb, void *data) { struct j1939_priv *priv = data; struct sk_buff *skb; struct j1939_sk_buff_cb *skcb, *iskcb; struct can_frame *cf; /* make sure we only get Classical CAN frames */ if (!can_is_can_skb(iskb)) return; /* create a copy of the skb * j1939 only delivers the real data bytes, * the header goes into sockaddr.
* j1939 may not touch the incoming skb in such way */ skb = skb_clone(iskb, GFP_ATOMIC); if (!skb) return; j1939_priv_get(priv); can_skb_set_owner(skb, iskb->sk); /* get a pointer to the header of the skb * the skb payload (pointer) is moved, so that the next skb_data * returns the actual payload */ cf = (void *)skb->data; skb_pull(skb, J1939_CAN_HDR); /* fix length, set to dlc, with 8 maximum */ skb_trim(skb, min_t(uint8_t, cf->len, 8)); /* set addr */ skcb = j1939_skb_to_cb(skb); memset(skcb, 0, sizeof(*skcb)); iskcb = j1939_skb_to_cb(iskb); skcb->tskey = iskcb->tskey; skcb->priority = (cf->can_id >> 26) & 0x7; skcb->addr.sa = cf->can_id; skcb->addr.pgn = (cf->can_id >> 8) & J1939_PGN_MAX; /* set default message type */ skcb->addr.type = J1939_TP; if (!j1939_address_is_valid(skcb->addr.sa)) { netdev_err_once(priv->ndev, "%s: sa is broadcast address, ignoring!\n", __func__); goto done; } if (j1939_pgn_is_pdu1(skcb->addr.pgn)) { /* Type 1: with destination address */ skcb->addr.da = skcb->addr.pgn; /* normalize pgn: strip dst address */ skcb->addr.pgn &= 0x3ff00; } else { /* set broadcast address */ skcb->addr.da = J1939_NO_ADDR; } /* update localflags */ read_lock_bh(&priv->lock); if (j1939_address_is_unicast(skcb->addr.sa) && priv->ents[skcb->addr.sa].nusers) skcb->flags |= J1939_ECU_LOCAL_SRC; if (j1939_address_is_unicast(skcb->addr.da) && priv->ents[skcb->addr.da].nusers) skcb->flags |= J1939_ECU_LOCAL_DST; read_unlock_bh(&priv->lock); /* deliver into the j1939 stack ... */ j1939_ac_recv(priv, skb); if (j1939_tp_recv(priv, skb)) /* this means the transport layer processed the message */ goto done; j1939_simple_recv(priv, skb); j1939_sk_recv(priv, skb); done: j1939_priv_put(priv); kfree_skb(skb); } /* NETDEV MANAGEMENT */ /* values for can_rx_(un)register */ #define J1939_CAN_ID CAN_EFF_FLAG #define J1939_CAN_MASK (CAN_EFF_FLAG | CAN_RTR_FLAG) static DEFINE_MUTEX(j1939_netdev_lock); static struct j1939_priv *j1939_priv_create(struct net_device *ndev) { struct j1939_priv *priv; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return NULL; rwlock_init(&priv->lock); INIT_LIST_HEAD(&priv->ecus); priv->ndev = ndev; kref_init(&priv->kref); kref_init(&priv->rx_kref); dev_hold(ndev); netdev_dbg(priv->ndev, "%s : 0x%p\n", __func__, priv); return priv; } static inline void j1939_priv_set(struct net_device *ndev, struct j1939_priv *priv) { struct can_ml_priv *can_ml = can_get_ml_priv(ndev); can_ml->j1939_priv = priv; } static void __j1939_priv_release(struct kref *kref) { struct j1939_priv *priv = container_of(kref, struct j1939_priv, kref); struct net_device *ndev = priv->ndev; netdev_dbg(priv->ndev, "%s: 0x%p\n", __func__, priv); WARN_ON_ONCE(!list_empty(&priv->active_session_list)); WARN_ON_ONCE(!list_empty(&priv->ecus)); WARN_ON_ONCE(!list_empty(&priv->j1939_socks)); dev_put(ndev); kfree(priv); } void j1939_priv_put(struct j1939_priv *priv) { kref_put(&priv->kref, __j1939_priv_release); } void j1939_priv_get(struct j1939_priv *priv) { kref_get(&priv->kref); } static int j1939_can_rx_register(struct j1939_priv *priv) { struct net_device *ndev = priv->ndev; int ret; j1939_priv_get(priv); ret = can_rx_register(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK, j1939_can_recv, priv, "j1939", NULL); if (ret < 0) { j1939_priv_put(priv); return ret; } return 0; } static void j1939_can_rx_unregister(struct j1939_priv *priv) { struct net_device *ndev = priv->ndev; can_rx_unregister(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK, j1939_can_recv, priv); /* The last reference of priv is dropped by 
the RCU deferred * j1939_sk_sock_destruct() of the last socket, so we can * safely drop this reference here. */ j1939_priv_put(priv); } static void __j1939_rx_release(struct kref *kref) __releases(&j1939_netdev_lock) { struct j1939_priv *priv = container_of(kref, struct j1939_priv, rx_kref); j1939_can_rx_unregister(priv); j1939_ecu_unmap_all(priv); j1939_priv_set(priv->ndev, NULL); mutex_unlock(&j1939_netdev_lock); } /* get pointer to priv without increasing ref counter */ static inline struct j1939_priv *j1939_ndev_to_priv(struct net_device *ndev) { struct can_ml_priv *can_ml = can_get_ml_priv(ndev); return can_ml->j1939_priv; } static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev) { struct j1939_priv *priv; lockdep_assert_held(&j1939_netdev_lock); priv = j1939_ndev_to_priv(ndev); if (priv) j1939_priv_get(priv); return priv; } static struct j1939_priv *j1939_priv_get_by_ndev(struct net_device *ndev) { struct j1939_priv *priv; mutex_lock(&j1939_netdev_lock); priv = j1939_priv_get_by_ndev_locked(ndev); mutex_unlock(&j1939_netdev_lock); return priv; } struct j1939_priv *j1939_netdev_start(struct net_device *ndev) { struct j1939_priv *priv, *priv_new; int ret; mutex_lock(&j1939_netdev_lock); priv = j1939_priv_get_by_ndev_locked(ndev); if (priv) { kref_get(&priv->rx_kref); mutex_unlock(&j1939_netdev_lock); return priv; } mutex_unlock(&j1939_netdev_lock); priv = j1939_priv_create(ndev); if (!priv) return ERR_PTR(-ENOMEM); j1939_tp_init(priv); spin_lock_init(&priv->j1939_socks_lock); INIT_LIST_HEAD(&priv->j1939_socks); mutex_lock(&j1939_netdev_lock); priv_new = j1939_priv_get_by_ndev_locked(ndev); if (priv_new) { /* Someone was faster than us, use their priv and roll * back our's. */ kref_get(&priv_new->rx_kref); mutex_unlock(&j1939_netdev_lock); dev_put(ndev); kfree(priv); return priv_new; } j1939_priv_set(ndev, priv); ret = j1939_can_rx_register(priv); if (ret < 0) goto out_priv_put; mutex_unlock(&j1939_netdev_lock); return priv; out_priv_put: j1939_priv_set(ndev, NULL); mutex_unlock(&j1939_netdev_lock); dev_put(ndev); kfree(priv); return ERR_PTR(ret); } void j1939_netdev_stop(struct j1939_priv *priv) { kref_put_mutex(&priv->rx_kref, __j1939_rx_release, &j1939_netdev_lock); j1939_priv_put(priv); } int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb) { int ret, dlc; canid_t canid; struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); struct can_frame *cf; /* apply sanity checks */ if (j1939_pgn_is_pdu1(skcb->addr.pgn)) skcb->addr.pgn &= J1939_PGN_PDU1_MAX; else skcb->addr.pgn &= J1939_PGN_MAX; if (skcb->priority > 7) skcb->priority = 6; ret = j1939_ac_fixup(priv, skb); if (unlikely(ret)) goto failed; dlc = skb->len; /* re-claim the CAN_HDR from the SKB */ cf = skb_push(skb, J1939_CAN_HDR); /* initialize header structure */ memset(cf, 0, J1939_CAN_HDR); /* make it a full can frame again */ skb_put(skb, J1939_CAN_FTR + (8 - dlc)); canid = CAN_EFF_FLAG | (skcb->priority << 26) | (skcb->addr.pgn << 8) | skcb->addr.sa; if (j1939_pgn_is_pdu1(skcb->addr.pgn)) canid |= skcb->addr.da << 8; cf->can_id = canid; cf->len = dlc; return can_send(skb, 1); failed: kfree_skb(skb); return ret; } static int j1939_netdev_notify(struct notifier_block *nb, unsigned long msg, void *data) { struct net_device *ndev = netdev_notifier_info_to_dev(data); struct can_ml_priv *can_ml = can_get_ml_priv(ndev); struct j1939_priv *priv; if (!can_ml) goto notify_done; priv = j1939_priv_get_by_ndev(ndev); if (!priv) goto notify_done; switch (msg) { case NETDEV_DOWN: 
j1939_cancel_active_session(priv, NULL); j1939_sk_netdev_event_netdown(priv); j1939_ecu_unmap_all(priv); break; } j1939_priv_put(priv); notify_done: return NOTIFY_DONE; } static struct notifier_block j1939_netdev_notifier = { .notifier_call = j1939_netdev_notify, }; /* MODULE interface */ static __init int j1939_module_init(void) { int ret; pr_info("can: SAE J1939\n"); ret = register_netdevice_notifier(&j1939_netdev_notifier); if (ret) goto fail_notifier; ret = can_proto_register(&j1939_can_proto); if (ret < 0) { pr_err("can: registration of j1939 protocol failed\n"); goto fail_sk; } return 0; fail_sk: unregister_netdevice_notifier(&j1939_netdev_notifier); fail_notifier: return ret; } static __exit void j1939_module_exit(void) { can_proto_unregister(&j1939_can_proto); unregister_netdevice_notifier(&j1939_netdev_notifier); } module_init(j1939_module_init); module_exit(j1939_module_exit);
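/*
 * Decode sketch (not part of this file): mirrors the can_id unpacking done in
 * j1939_can_recv() above for a 29-bit extended identifier. The constant values
 * and the PDU1 test are assumed to match J1939_PGN_MAX, J1939_NO_ADDR and
 * j1939_pgn_is_pdu1() in j1939-priv.h; the struct and function names here are
 * illustrative only.
 */
#include <linux/types.h>

struct j1939_id_fields {
	u8 priority;	/* bits 26..28 */
	u32 pgn;	/* parameter group number, dst byte stripped for PDU1 */
	u8 da;		/* destination address, 0xff = broadcast */
	u8 sa;		/* source address, bits 0..7 */
};

static void example_j1939_decode_id(u32 can_id, struct j1939_id_fields *f)
{
	f->priority = (can_id >> 26) & 0x7;
	f->sa = can_id & 0xff;
	f->pgn = (can_id >> 8) & 0x3ffff;	/* J1939_PGN_MAX */

	if ((f->pgn & 0xff00) < 0xf000) {
		/* PDU1: the PS byte is the destination address */
		f->da = f->pgn & 0xff;
		f->pgn &= 0x3ff00;	/* normalize: strip dst address */
	} else {
		/* PDU2: group extension, always broadcast */
		f->da = 0xff;		/* J1939_NO_ADDR */
	}
}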
// SPDX-License-Identifier: GPL-2.0

#include <linux/net.h>
#include <linux/uio.h>
#include <net/sock.h>
#include <linux/nospec.h>

#include "rsrc.h"

#define IO_NOTIF_UBUF_FLAGS	(SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN)
#define IO_NOTIF_SPLICE_BATCH	32

struct io_notif_data {
	struct file		*file;
	struct ubuf_info	uarg;
	unsigned long		account_pages;
	bool			zc_report;
	bool			zc_used;
	bool			zc_copied;
};

struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx);
void io_notif_set_extended(struct io_kiocb *notif);

static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif)
{
	return io_kiocb_to_cmd(notif, struct io_notif_data);
}

static inline void io_notif_flush(struct io_kiocb *notif)
	__must_hold(&notif->ctx->uring_lock)
{
	struct io_notif_data *nd = io_notif_to_data(notif);

	/* drop slot's master ref */
	if (refcount_dec_and_test(&nd->uarg.refcnt))
		__io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE);
}

static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len)
{
	struct io_ring_ctx *ctx = notif->ctx;
	struct io_notif_data *nd = io_notif_to_data(notif);
	unsigned nr_pages = (len >> PAGE_SHIFT) + 2;
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
		nd->account_pages += nr_pages;
	}
	return 0;
}
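/*
 * Editor's note (illustration only): io_notif_account_mem() above charges a
 * cheap upper bound of (len >> PAGE_SHIFT) + 2 pages rather than computing
 * exactly how many pages the buffer touches; the "+ 2" covers a start and an
 * end that each straddle a page boundary. The sketch below contrasts the
 * bound with an exact count, assuming a 4 KiB page size for the example.
 */
#include <stddef.h>
#include <stdio.h>

#define EX_PAGE_SHIFT	12
#define EX_PAGE_SIZE	(1UL << EX_PAGE_SHIFT)

/* Upper bound used above: whole pages plus one for each unaligned end. */
static size_t pages_upper_bound(size_t len)
{
	return (len >> EX_PAGE_SHIFT) + 2;
}

/* Exact count for a buffer starting 'addr' bytes into the address space. */
static size_t pages_exact(size_t addr, size_t len)
{
	size_t first = addr >> EX_PAGE_SHIFT;
	size_t last = (addr + len - 1) >> EX_PAGE_SHIFT;

	return last - first + 1;
}

int main(void)
{
	printf("len=10000: bound=%zu exact(worst case)=%zu\n",
	       pages_upper_bound(10000),
	       pages_exact(EX_PAGE_SIZE - 1, 10000));
	return 0;
}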
1 1 1 1 1 36 18 22 18 3 24 24 7 1 2 2 1 1 2 2 1 1 5 5 13 11 2 12 1 2 11 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 // SPDX-License-Identifier: GPL-2.0-only /* * file.c * * PURPOSE * File handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT * (C) 1998-1999 Dave Boynton * (C) 1998-2004 Ben Fennema * (C) 1999-2000 Stelias Computing Inc * * HISTORY * * 10/02/98 dgb Attempt to integrate into udf.o * 10/07/98 Switched to using generic_readpage, etc., like isofs * And it works! * 12/06/98 blf Added udf_file_read. uses generic_file_read for all cases but * ICBTAG_FLAG_AD_IN_ICB. * 04/06/99 64 bit file handling on 32 bit systems taken from ext2 file.c * 05/12/99 Preliminary file write support */ #include "udfdecl.h" #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/string.h> /* memset */ #include <linux/capability.h> #include <linux/errno.h> #include <linux/pagemap.h> #include <linux/uio.h> #include "udf_i.h" #include "udf_sb.h" static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); struct address_space *mapping = inode->i_mapping; struct page *page = vmf->page; loff_t size; unsigned int end; vm_fault_t ret = VM_FAULT_LOCKED; int err; sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); filemap_invalidate_lock_shared(mapping); lock_page(page); size = i_size_read(inode); if (page->mapping != inode->i_mapping || page_offset(page) >= size) { unlock_page(page); ret = VM_FAULT_NOPAGE; goto out_unlock; } /* Space is already allocated for in-ICB file */ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) goto out_dirty; if (page->index == size >> PAGE_SHIFT) end = size & ~PAGE_MASK; else end = PAGE_SIZE; err = __block_write_begin(page, 0, end, udf_get_block); if (err) { unlock_page(page); ret = vmf_fs_error(err); goto out_unlock; } block_commit_write(page, 0, end); out_dirty: set_page_dirty(page); wait_for_stable_page(page); out_unlock: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); return ret; } static const struct vm_operations_struct udf_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = udf_page_mkwrite, }; static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t retval; struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct udf_inode_info *iinfo = UDF_I(inode); inode_lock(inode); retval = generic_write_checks(iocb, from); if (retval <= 0) goto out; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && inode->i_sb->s_blocksize < (udf_file_entry_alloc_offset(inode) + iocb->ki_pos + 
iov_iter_count(from))) { filemap_invalidate_lock(inode->i_mapping); retval = udf_expand_file_adinicb(inode); filemap_invalidate_unlock(inode->i_mapping); if (retval) goto out; } retval = __generic_file_write_iter(iocb, from); out: if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && retval > 0) { down_write(&iinfo->i_data_sem); iinfo->i_lenAlloc = inode->i_size; up_write(&iinfo->i_data_sem); } inode_unlock(inode); if (retval > 0) { mark_inode_dirty(inode); retval = generic_write_sync(iocb, retval); } return retval; } long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); long old_block, new_block; int result; if (file_permission(filp, MAY_READ) != 0) { udf_debug("no permission to access inode %lu\n", inode->i_ino); return -EPERM; } if (!arg && ((cmd == UDF_GETVOLIDENT) || (cmd == UDF_GETEASIZE) || (cmd == UDF_RELOCATE_BLOCKS) || (cmd == UDF_GETEABLOCK))) { udf_debug("invalid argument to udf_ioctl\n"); return -EINVAL; } switch (cmd) { case UDF_GETVOLIDENT: if (copy_to_user((char __user *)arg, UDF_SB(inode->i_sb)->s_volume_ident, 32)) return -EFAULT; return 0; case UDF_RELOCATE_BLOCKS: if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(old_block, (long __user *)arg)) return -EFAULT; result = udf_relocate_blocks(inode->i_sb, old_block, &new_block); if (result == 0) result = put_user(new_block, (long __user *)arg); return result; case UDF_GETEASIZE: return put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg); case UDF_GETEABLOCK: return copy_to_user((char __user *)arg, UDF_I(inode)->i_data, UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0; default: return -ENOIOCTLCMD; } return 0; } static int udf_release_file(struct inode *inode, struct file *filp) { if (filp->f_mode & FMODE_WRITE && atomic_read(&inode->i_writecount) == 1) { /* * Grab i_mutex to avoid races with writes changing i_size * while we are running. */ inode_lock(inode); down_write(&UDF_I(inode)->i_data_sem); udf_discard_prealloc(inode); udf_truncate_tail_extent(inode); up_write(&UDF_I(inode)->i_data_sem); inode_unlock(inode); } return 0; } static int udf_file_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &udf_file_vm_ops; return 0; } const struct file_operations udf_file_operations = { .read_iter = generic_file_read_iter, .unlocked_ioctl = udf_ioctl, .open = generic_file_open, .mmap = udf_file_mmap, .write_iter = udf_file_write_iter, .release = udf_release_file, .fsync = generic_file_fsync, .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, }; static int udf_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; int error; error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; if ((attr->ia_valid & ATTR_UID) && UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET) && !uid_eq(attr->ia_uid, UDF_SB(sb)->s_uid)) return -EPERM; if ((attr->ia_valid & ATTR_GID) && UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET) && !gid_eq(attr->ia_gid, UDF_SB(sb)->s_gid)) return -EPERM; if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { error = udf_setsize(inode, attr->ia_size); if (error) return error; } if (attr->ia_valid & ATTR_MODE) udf_update_extra_perms(inode, attr->ia_mode); setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } const struct inode_operations udf_file_inode_operations = { .setattr = udf_setattr, };
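/*
 * Editor's illustration (not part of fs/udf/file.c): a minimal user-space
 * sketch of the udf_ioctl() interface above, assuming the UDF ioctl numbers
 * are exported via <linux/udf_fs_i.h>; the path used here is hypothetical.
 * UDF_GETVOLIDENT copies a 32-byte volume identifier into the caller's
 * buffer, exactly as the handler above shows; the handler also requires read
 * permission on the opened file and a non-NULL argument.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/udf_fs_i.h>

int main(void)
{
	char volid[32];
	int fd = open("/mnt/udf", O_RDONLY);	/* any file on the UDF fs */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(volid, 0, sizeof(volid));
	if (ioctl(fd, UDF_GETVOLIDENT, volid) == 0)
		printf("volume ident: %.32s\n", volid);
	else
		perror("UDF_GETVOLIDENT");
	close(fd);
	return 0;
}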
884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * Copyright (c) 2018 Red Hat, Inc. * All rights reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_alloc_btree.h" #include "xfs_rmap_btree.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_rmap.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_health.h" #include "xfs_error.h" #include "xfs_bmap.h" #include "xfs_defer.h" #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_trace.h" #include "xfs_inode.h" #include "xfs_icache.h" /* * Passive reference counting access wrappers to the perag structures. If the * per-ag structure is to be freed, the freeing code is responsible for cleaning * up objects with passive references before freeing the structure. This is * things like cached buffers. */ struct xfs_perag * xfs_perag_get( struct xfs_mount *mp, xfs_agnumber_t agno) { struct xfs_perag *pag; rcu_read_lock(); pag = radix_tree_lookup(&mp->m_perag_tree, agno); if (pag) { trace_xfs_perag_get(pag, _RET_IP_); ASSERT(atomic_read(&pag->pag_ref) >= 0); atomic_inc(&pag->pag_ref); } rcu_read_unlock(); return pag; } /* * search from @first to find the next perag with the given tag set. */ struct xfs_perag * xfs_perag_get_tag( struct xfs_mount *mp, xfs_agnumber_t first, unsigned int tag) { struct xfs_perag *pag; int found; rcu_read_lock(); found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, (void **)&pag, first, 1, tag); if (found <= 0) { rcu_read_unlock(); return NULL; } trace_xfs_perag_get_tag(pag, _RET_IP_); atomic_inc(&pag->pag_ref); rcu_read_unlock(); return pag; } /* Get a passive reference to the given perag. */ struct xfs_perag * xfs_perag_hold( struct xfs_perag *pag) { ASSERT(atomic_read(&pag->pag_ref) > 0 || atomic_read(&pag->pag_active_ref) > 0); trace_xfs_perag_hold(pag, _RET_IP_); atomic_inc(&pag->pag_ref); return pag; } void xfs_perag_put( struct xfs_perag *pag) { trace_xfs_perag_put(pag, _RET_IP_); ASSERT(atomic_read(&pag->pag_ref) > 0); atomic_dec(&pag->pag_ref); } /* * Active references for perag structures. This is for short term access to the * per ag structures for walking trees or accessing state. 
If an AG is being * shrunk or is offline, then this will fail to find that AG and return NULL * instead. */ struct xfs_perag * xfs_perag_grab( struct xfs_mount *mp, xfs_agnumber_t agno) { struct xfs_perag *pag; rcu_read_lock(); pag = radix_tree_lookup(&mp->m_perag_tree, agno); if (pag) { trace_xfs_perag_grab(pag, _RET_IP_); if (!atomic_inc_not_zero(&pag->pag_active_ref)) pag = NULL; } rcu_read_unlock(); return pag; } /* * search from @first to find the next perag with the given tag set. */ struct xfs_perag * xfs_perag_grab_tag( struct xfs_mount *mp, xfs_agnumber_t first, int tag) { struct xfs_perag *pag; int found; rcu_read_lock(); found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, (void **)&pag, first, 1, tag); if (found <= 0) { rcu_read_unlock(); return NULL; } trace_xfs_perag_grab_tag(pag, _RET_IP_); if (!atomic_inc_not_zero(&pag->pag_active_ref)) pag = NULL; rcu_read_unlock(); return pag; } void xfs_perag_rele( struct xfs_perag *pag) { trace_xfs_perag_rele(pag, _RET_IP_); if (atomic_dec_and_test(&pag->pag_active_ref)) wake_up(&pag->pag_active_wq); } /* * xfs_initialize_perag_data * * Read in each per-ag structure so we can count up the number of * allocated inodes, free inodes and used filesystem blocks as this * information is no longer persistent in the superblock. Once we have * this information, write it into the in-core superblock structure. */ int xfs_initialize_perag_data( struct xfs_mount *mp, xfs_agnumber_t agcount) { xfs_agnumber_t index; struct xfs_perag *pag; struct xfs_sb *sbp = &mp->m_sb; uint64_t ifree = 0; uint64_t ialloc = 0; uint64_t bfree = 0; uint64_t bfreelst = 0; uint64_t btree = 0; uint64_t fdblocks; int error = 0; for (index = 0; index < agcount; index++) { /* * Read the AGF and AGI buffers to populate the per-ag * structures for us. */ pag = xfs_perag_get(mp, index); error = xfs_alloc_read_agf(pag, NULL, 0, NULL); if (!error) error = xfs_ialloc_read_agi(pag, NULL, NULL); if (error) { xfs_perag_put(pag); return error; } ifree += pag->pagi_freecount; ialloc += pag->pagi_count; bfree += pag->pagf_freeblks; bfreelst += pag->pagf_flcount; btree += pag->pagf_btreeblks; xfs_perag_put(pag); } fdblocks = bfree + bfreelst + btree; /* * If the new summary counts are obviously incorrect, fail the * mount operation because that implies the AGFs are also corrupt. * Clear FS_COUNTERS so that we don't unmount with a dirty log, which * will prevent xfs_repair from fixing anything. */ if (fdblocks > sbp->sb_dblocks || ifree > ialloc) { xfs_alert(mp, "AGF corruption. Please run xfs_repair."); error = -EFSCORRUPTED; goto out; } /* Overwrite incore superblock counters with just-read data */ spin_lock(&mp->m_sb_lock); sbp->sb_ifree = ifree; sbp->sb_icount = ialloc; sbp->sb_fdblocks = fdblocks; spin_unlock(&mp->m_sb_lock); xfs_reinit_percpu_counters(mp); out: xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS); return error; } STATIC void __xfs_free_perag( struct rcu_head *head) { struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); kmem_free(pag); } /* * Free up the per-ag resources associated with the mount structure. 
*/ void xfs_free_perag( struct xfs_mount *mp) { struct xfs_perag *pag; xfs_agnumber_t agno; for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { spin_lock(&mp->m_perag_lock); pag = radix_tree_delete(&mp->m_perag_tree, agno); spin_unlock(&mp->m_perag_lock); ASSERT(pag); XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); xfs_defer_drain_free(&pag->pag_intents_drain); cancel_delayed_work_sync(&pag->pag_blockgc_work); xfs_buf_hash_destroy(pag); /* drop the mount's active reference */ xfs_perag_rele(pag); XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_active_ref) != 0); call_rcu(&pag->rcu_head, __xfs_free_perag); } } /* Find the size of the AG, in blocks. */ static xfs_agblock_t __xfs_ag_block_count( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agnumber_t agcount, xfs_rfsblock_t dblocks) { ASSERT(agno < agcount); if (agno < agcount - 1) return mp->m_sb.sb_agblocks; return dblocks - (agno * mp->m_sb.sb_agblocks); } xfs_agblock_t xfs_ag_block_count( struct xfs_mount *mp, xfs_agnumber_t agno) { return __xfs_ag_block_count(mp, agno, mp->m_sb.sb_agcount, mp->m_sb.sb_dblocks); } /* Calculate the first and last possible inode number in an AG. */ static void __xfs_agino_range( struct xfs_mount *mp, xfs_agblock_t eoag, xfs_agino_t *first, xfs_agino_t *last) { xfs_agblock_t bno; /* * Calculate the first inode, which will be in the first * cluster-aligned block after the AGFL. */ bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align); *first = XFS_AGB_TO_AGINO(mp, bno); /* * Calculate the last inode, which will be at the end of the * last (aligned) cluster that can be allocated in the AG. */ bno = round_down(eoag, M_IGEO(mp)->cluster_align); *last = XFS_AGB_TO_AGINO(mp, bno) - 1; } void xfs_agino_range( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t *first, xfs_agino_t *last) { return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last); } /* * Free perag within the specified AG range, it is only used to free unused * perags under the error handling path. */ void xfs_free_unused_perag_range( struct xfs_mount *mp, xfs_agnumber_t agstart, xfs_agnumber_t agend) { struct xfs_perag *pag; xfs_agnumber_t index; for (index = agstart; index < agend; index++) { spin_lock(&mp->m_perag_lock); pag = radix_tree_delete(&mp->m_perag_tree, index); spin_unlock(&mp->m_perag_lock); if (!pag) break; xfs_buf_hash_destroy(pag); xfs_defer_drain_free(&pag->pag_intents_drain); kmem_free(pag); } } int xfs_initialize_perag( struct xfs_mount *mp, xfs_agnumber_t agcount, xfs_rfsblock_t dblocks, xfs_agnumber_t *maxagi) { struct xfs_perag *pag; xfs_agnumber_t index; xfs_agnumber_t first_initialised = NULLAGNUMBER; int error; /* * Walk the current per-ag tree so we don't try to initialise AGs * that already exist (growfs case). Allocate and insert all the * AGs we don't find ready for initialisation. */ for (index = 0; index < agcount; index++) { pag = xfs_perag_get(mp, index); if (pag) { xfs_perag_put(pag); continue; } pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); if (!pag) { error = -ENOMEM; goto out_unwind_new_pags; } pag->pag_agno = index; pag->pag_mount = mp; error = radix_tree_preload(GFP_NOFS); if (error) goto out_free_pag; spin_lock(&mp->m_perag_lock); if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { WARN_ON_ONCE(1); spin_unlock(&mp->m_perag_lock); radix_tree_preload_end(); error = -EEXIST; goto out_free_pag; } spin_unlock(&mp->m_perag_lock); radix_tree_preload_end(); #ifdef __KERNEL__ /* Place kernel structure only init below this point. 
*/ spin_lock_init(&pag->pag_ici_lock); spin_lock_init(&pag->pagb_lock); spin_lock_init(&pag->pag_state_lock); INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); xfs_defer_drain_init(&pag->pag_intents_drain); init_waitqueue_head(&pag->pagb_wait); init_waitqueue_head(&pag->pag_active_wq); pag->pagb_count = 0; pag->pagb_tree = RB_ROOT; #endif /* __KERNEL__ */ error = xfs_buf_hash_init(pag); if (error) goto out_remove_pag; /* Active ref owned by mount indicates AG is online. */ atomic_set(&pag->pag_active_ref, 1); /* first new pag is fully initialized */ if (first_initialised == NULLAGNUMBER) first_initialised = index; /* * Pre-calculated geometry */ pag->block_count = __xfs_ag_block_count(mp, index, agcount, dblocks); pag->min_block = XFS_AGFL_BLOCK(mp); __xfs_agino_range(mp, pag->block_count, &pag->agino_min, &pag->agino_max); } index = xfs_set_inode_alloc(mp, agcount); if (maxagi) *maxagi = index; mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); return 0; out_remove_pag: xfs_defer_drain_free(&pag->pag_intents_drain); spin_lock(&mp->m_perag_lock); radix_tree_delete(&mp->m_perag_tree, index); spin_unlock(&mp->m_perag_lock); out_free_pag: kmem_free(pag); out_unwind_new_pags: /* unwind any prior newly initialized pags */ xfs_free_unused_perag_range(mp, first_initialised, agcount); return error; } static int xfs_get_aghdr_buf( struct xfs_mount *mp, xfs_daddr_t blkno, size_t numblks, struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { struct xfs_buf *bp; int error; error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp); if (error) return error; bp->b_maps[0].bm_bn = blkno; bp->b_ops = ops; *bpp = bp; return 0; } /* * Generic btree root block init function */ static void xfs_btroot_init( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno); } /* Finish initializing a free space btree. */ static void xfs_freesp_init_recs( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { struct xfs_alloc_rec *arec; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); if (xfs_ag_contains_log(mp, id->agno)) { struct xfs_alloc_rec *nrec; xfs_agblock_t start = XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart); ASSERT(start >= mp->m_ag_prealloc_blocks); if (start != mp->m_ag_prealloc_blocks) { /* * Modify first record to pad stripe align of log and * bump the record count. */ arec->ar_blockcount = cpu_to_be32(start - mp->m_ag_prealloc_blocks); be16_add_cpu(&block->bb_numrecs, 1); nrec = arec + 1; /* * Insert second record at start of internal log * which then gets trimmed. */ nrec->ar_startblock = cpu_to_be32( be32_to_cpu(arec->ar_startblock) + be32_to_cpu(arec->ar_blockcount)); arec = nrec; } /* * Change record start to after the internal log */ be32_add_cpu(&arec->ar_startblock, mp->m_sb.sb_logblocks); } /* * Calculate the block count of this record; if it is nonzero, * increment the record count. 
*/ arec->ar_blockcount = cpu_to_be32(id->agsize - be32_to_cpu(arec->ar_startblock)); if (arec->ar_blockcount) be16_add_cpu(&block->bb_numrecs, 1); } /* * Alloc btree root block init functions */ static void xfs_bnoroot_init( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 0, id->agno); xfs_freesp_init_recs(mp, bp, id); } static void xfs_cntroot_init( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 0, id->agno); xfs_freesp_init_recs(mp, bp, id); } /* * Reverse map root block init */ static void xfs_rmaproot_init( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_rmap_rec *rrec; xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno); /* * mark the AG header regions as static metadata The BNO * btree block is the first block after the headers, so * it's location defines the size of region the static * metadata consumes. * * Note: unlike mkfs, we never have to account for log * space when growing the data regions */ rrec = XFS_RMAP_REC_ADDR(block, 1); rrec->rm_startblock = 0; rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp)); rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS); rrec->rm_offset = 0; /* account freespace btree root blocks */ rrec = XFS_RMAP_REC_ADDR(block, 2); rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp)); rrec->rm_blockcount = cpu_to_be32(2); rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); rrec->rm_offset = 0; /* account inode btree root blocks */ rrec = XFS_RMAP_REC_ADDR(block, 3); rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp)); rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) - XFS_IBT_BLOCK(mp)); rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT); rrec->rm_offset = 0; /* account for rmap btree root */ rrec = XFS_RMAP_REC_ADDR(block, 4); rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp)); rrec->rm_blockcount = cpu_to_be32(1); rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); rrec->rm_offset = 0; /* account for refc btree root */ if (xfs_has_reflink(mp)) { rrec = XFS_RMAP_REC_ADDR(block, 5); rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp)); rrec->rm_blockcount = cpu_to_be32(1); rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC); rrec->rm_offset = 0; be16_add_cpu(&block->bb_numrecs, 1); } /* account for the log space */ if (xfs_ag_contains_log(mp, id->agno)) { rrec = XFS_RMAP_REC_ADDR(block, be16_to_cpu(block->bb_numrecs) + 1); rrec->rm_startblock = cpu_to_be32( XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart)); rrec->rm_blockcount = cpu_to_be32(mp->m_sb.sb_logblocks); rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_LOG); rrec->rm_offset = 0; be16_add_cpu(&block->bb_numrecs, 1); } } /* * Initialise new secondary superblocks with the pre-grow geometry, but mark * them as "in progress" so we know they haven't yet been activated. This will * get cleared when the update with the new geometry information is done after * changes to the primary are committed. This isn't strictly necessary, but we * get it for free with the delayed buffer write lists and it means we can tell * if a grow operation didn't complete properly after the fact. 
*/ static void xfs_sbblock_init( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { struct xfs_dsb *dsb = bp->b_addr; xfs_sb_to_disk(dsb, &mp->m_sb); dsb->sb_inprogress = 1; } static void xfs_agfblock_init( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { struct xfs_agf *agf = bp->b_addr; xfs_extlen_t tmpsize; agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); agf->agf_seqno = cpu_to_be32(id->agno); agf->agf_length = cpu_to_be32(id->agsize); agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp)); agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); if (xfs_has_rmapbt(mp)) { agf->agf_roots[XFS_BTNUM_RMAPi] = cpu_to_be32(XFS_RMAP_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); agf->agf_rmap_blocks = cpu_to_be32(1); } agf->agf_flfirst = cpu_to_be32(1); agf->agf_fllast = 0; agf->agf_flcount = 0; tmpsize = id->agsize - mp->m_ag_prealloc_blocks; agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); if (xfs_has_crc(mp)) uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); if (xfs_has_reflink(mp)) { agf->agf_refcount_root = cpu_to_be32( xfs_refc_block(mp)); agf->agf_refcount_level = cpu_to_be32(1); agf->agf_refcount_blocks = cpu_to_be32(1); } if (xfs_ag_contains_log(mp, id->agno)) { int64_t logblocks = mp->m_sb.sb_logblocks; be32_add_cpu(&agf->agf_freeblks, -logblocks); agf->agf_longest = cpu_to_be32(id->agsize - XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart) - logblocks); } } static void xfs_agflblock_init( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); __be32 *agfl_bno; int bucket; if (xfs_has_crc(mp)) { agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); agfl->agfl_seqno = cpu_to_be32(id->agno); uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); } agfl_bno = xfs_buf_to_agfl_bno(bp); for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++) agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); } static void xfs_agiblock_init( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { struct xfs_agi *agi = bp->b_addr; int bucket; agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); agi->agi_seqno = cpu_to_be32(id->agno); agi->agi_length = cpu_to_be32(id->agsize); agi->agi_count = 0; agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp)); agi->agi_level = cpu_to_be32(1); agi->agi_freecount = 0; agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); if (xfs_has_crc(mp)) uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); if (xfs_has_finobt(mp)) { agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); agi->agi_free_level = cpu_to_be32(1); } for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); if (xfs_has_inobtcounts(mp)) { agi->agi_iblocks = cpu_to_be32(1); if (xfs_has_finobt(mp)) agi->agi_fblocks = cpu_to_be32(1); } } typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id); static int xfs_ag_init_hdr( struct xfs_mount *mp, struct aghdr_init_data *id, aghdr_init_work_f work, const struct xfs_buf_ops *ops) { struct xfs_buf *bp; int error; error = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, &bp, ops); if (error) return error; (*work)(mp, bp, id); xfs_buf_delwri_queue(bp, &id->buffer_list); xfs_buf_relse(bp); 
return 0; } struct xfs_aghdr_grow_data { xfs_daddr_t daddr; size_t numblks; const struct xfs_buf_ops *ops; aghdr_init_work_f work; xfs_btnum_t type; bool need_init; }; /* * Prepare new AG headers to be written to disk. We use uncached buffers here, * as it is assumed these new AG headers are currently beyond the currently * valid filesystem address space. Using cached buffers would trip over EOFS * corruption detection alogrithms in the buffer cache lookup routines. * * This is a non-transactional function, but the prepared buffers are added to a * delayed write buffer list supplied by the caller so they can submit them to * disk and wait on them as required. */ int xfs_ag_init_headers( struct xfs_mount *mp, struct aghdr_init_data *id) { struct xfs_aghdr_grow_data aghdr_data[] = { { /* SB */ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_SB_DADDR), .numblks = XFS_FSS_TO_BB(mp, 1), .ops = &xfs_sb_buf_ops, .work = &xfs_sbblock_init, .need_init = true }, { /* AGF */ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGF_DADDR(mp)), .numblks = XFS_FSS_TO_BB(mp, 1), .ops = &xfs_agf_buf_ops, .work = &xfs_agfblock_init, .need_init = true }, { /* AGFL */ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGFL_DADDR(mp)), .numblks = XFS_FSS_TO_BB(mp, 1), .ops = &xfs_agfl_buf_ops, .work = &xfs_agflblock_init, .need_init = true }, { /* AGI */ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGI_DADDR(mp)), .numblks = XFS_FSS_TO_BB(mp, 1), .ops = &xfs_agi_buf_ops, .work = &xfs_agiblock_init, .need_init = true }, { /* BNO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_bnobt_buf_ops, .work = &xfs_bnoroot_init, .need_init = true }, { /* CNT root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_cntbt_buf_ops, .work = &xfs_cntroot_init, .need_init = true }, { /* INO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_IBT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_inobt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_INO, .need_init = true }, { /* FINO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_finobt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_FINO, .need_init = xfs_has_finobt(mp) }, { /* RMAP root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_rmapbt_buf_ops, .work = &xfs_rmaproot_init, .need_init = xfs_has_rmapbt(mp) }, { /* REFC root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_refcountbt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_REFC, .need_init = xfs_has_reflink(mp) }, { /* NULL terminating block */ .daddr = XFS_BUF_DADDR_NULL, } }; struct xfs_aghdr_grow_data *dp; int error = 0; /* Account for AG free space in new AG */ id->nfree += id->agsize - mp->m_ag_prealloc_blocks; for (dp = &aghdr_data[0]; dp->daddr != XFS_BUF_DADDR_NULL; dp++) { if (!dp->need_init) continue; id->daddr = dp->daddr; id->numblks = dp->numblks; id->type = dp->type; error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops); if (error) break; } return error; } int xfs_ag_shrink_space( struct xfs_perag *pag, struct xfs_trans **tpp, xfs_extlen_t delta) { struct xfs_mount *mp = pag->pag_mount; struct xfs_alloc_arg args = { .tp = *tpp, .mp = mp, .pag = pag, .minlen = delta, .maxlen = delta, .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE, .resv = 
XFS_AG_RESV_NONE, .prod = 1 }; struct xfs_buf *agibp, *agfbp; struct xfs_agi *agi; struct xfs_agf *agf; xfs_agblock_t aglen; int error, err2; ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1); error = xfs_ialloc_read_agi(pag, *tpp, &agibp); if (error) return error; agi = agibp->b_addr; error = xfs_alloc_read_agf(pag, *tpp, 0, &agfbp); if (error) return error; agf = agfbp->b_addr; aglen = be32_to_cpu(agi->agi_length); /* some extra paranoid checks before we shrink the ag */ if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length)) return -EFSCORRUPTED; if (delta >= aglen) return -EINVAL; /* * Make sure that the last inode cluster cannot overlap with the new * end of the AG, even if it's sparse. */ error = xfs_ialloc_check_shrink(pag, *tpp, agibp, aglen - delta); if (error) return error; /* * Disable perag reservations so it doesn't cause the allocation request * to fail. We'll reestablish reservation before we return. */ error = xfs_ag_resv_free(pag); if (error) return error; /* internal log shouldn't also show up in the free space btrees */ error = xfs_alloc_vextent_exact_bno(&args, XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta)); if (!error && args.agbno == NULLAGBLOCK) error = -ENOSPC; if (error) { /* * if extent allocation fails, need to roll the transaction to * ensure that the AGFL fixup has been committed anyway. */ xfs_trans_bhold(*tpp, agfbp); err2 = xfs_trans_roll(tpp); if (err2) return err2; xfs_trans_bjoin(*tpp, agfbp); goto resv_init_out; } /* * if successfully deleted from freespace btrees, need to confirm * per-AG reservation works as expected. */ be32_add_cpu(&agi->agi_length, -delta); be32_add_cpu(&agf->agf_length, -delta); err2 = xfs_ag_resv_init(pag, *tpp); if (err2) { be32_add_cpu(&agi->agi_length, delta); be32_add_cpu(&agf->agf_length, delta); if (err2 != -ENOSPC) goto resv_err; err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, XFS_AG_RESV_NONE, true); if (err2) goto resv_err; /* * Roll the transaction before trying to re-init the per-ag * reservation. The new transaction is clean so it will cancel * without any side effects. */ error = xfs_defer_finish(tpp); if (error) return error; error = -ENOSPC; goto resv_init_out; } /* Update perag geometry */ pag->block_count -= delta; __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, &pag->agino_max); xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH); xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH); return 0; resv_init_out: err2 = xfs_ag_resv_init(pag, *tpp); if (!err2) return error; resv_err: xfs_warn(mp, "Error %d reserving per-AG metadata reserve pool.", err2); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return err2; } /* * Extent the AG indicated by the @id by the length passed in */ int xfs_ag_extend_space( struct xfs_perag *pag, struct xfs_trans *tp, xfs_extlen_t len) { struct xfs_buf *bp; struct xfs_agi *agi; struct xfs_agf *agf; int error; ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1); error = xfs_ialloc_read_agi(pag, tp, &bp); if (error) return error; agi = bp->b_addr; be32_add_cpu(&agi->agi_length, len); xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); /* * Change agf length. */ error = xfs_alloc_read_agf(pag, tp, 0, &bp); if (error) return error; agf = bp->b_addr; be32_add_cpu(&agf->agf_length, len); ASSERT(agf->agf_length == agi->agi_length); xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); /* * Free the new space. * * XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that * this doesn't actually exist in the rmap btree. 
*/ error = xfs_rmap_free(tp, bp, pag, be32_to_cpu(agf->agf_length) - len, len, &XFS_RMAP_OINFO_SKIP_UPDATE); if (error) return error; error = xfs_free_extent(tp, pag, be32_to_cpu(agf->agf_length) - len, len, &XFS_RMAP_OINFO_SKIP_UPDATE, XFS_AG_RESV_NONE); if (error) return error; /* Update perag geometry */ pag->block_count = be32_to_cpu(agf->agf_length); __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, &pag->agino_max); return 0; } /* Retrieve AG geometry. */ int xfs_ag_get_geometry( struct xfs_perag *pag, struct xfs_ag_geometry *ageo) { struct xfs_buf *agi_bp; struct xfs_buf *agf_bp; struct xfs_agi *agi; struct xfs_agf *agf; unsigned int freeblks; int error; /* Lock the AG headers. */ error = xfs_ialloc_read_agi(pag, NULL, &agi_bp); if (error) return error; error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp); if (error) goto out_agi; /* Fill out form. */ memset(ageo, 0, sizeof(*ageo)); ageo->ag_number = pag->pag_agno; agi = agi_bp->b_addr; ageo->ag_icount = be32_to_cpu(agi->agi_count); ageo->ag_ifree = be32_to_cpu(agi->agi_freecount); agf = agf_bp->b_addr; ageo->ag_length = be32_to_cpu(agf->agf_length); freeblks = pag->pagf_freeblks + pag->pagf_flcount + pag->pagf_btreeblks - xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE); ageo->ag_freeblks = freeblks; xfs_ag_geom_health(pag, ageo); /* Release resources. */ xfs_buf_relse(agf_bp); out_agi: xfs_buf_relse(agi_bp); return error; }
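/*
 * Editor's note (illustration only): the xfs_perag_grab()/xfs_perag_rele()
 * pair above implements the "active reference" pattern -- a lookup succeeds
 * only while the AG is still online, i.e. while its active count has not
 * dropped to zero. The stand-alone sketch below models atomic_inc_not_zero()
 * with C11 atomics; names are hypothetical and the RCU/waitqueue details of
 * the real code (the wake_up() in xfs_perag_rele()) are omitted.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
	atomic_int active_ref;	/* >0 while online, 0 once being torn down */
};

/* Take an active reference only if the object has not gone offline. */
static bool obj_grab(struct obj *o)
{
	int old = atomic_load(&o->active_ref);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&o->active_ref, &old, old + 1))
			return true;	/* reference taken */
	}
	return false;			/* object offline, lookup fails */
}

static void obj_rele(struct obj *o)
{
	atomic_fetch_sub(&o->active_ref, 1);
}

int main(void)
{
	struct obj o = { .active_ref = 1 };	/* owner's reference */

	printf("grab while online: %d\n", obj_grab(&o));	/* 1 */
	obj_rele(&o);			/* drop the reference we just took */
	obj_rele(&o);			/* owner takes the object offline */
	printf("grab after offline: %d\n", obj_grab(&o));	/* 0 */
	return 0;
}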
1 1 5 5 6 5 1 2 2 19 14 2 3 3 2 5 1 2 6 2 5 1 4 1 7 2 1 12 25 2 8 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 // SPDX-License-Identifier: GPL-2.0 #include <linux/cred.h> #include <linux/device.h> #include <linux/dma-buf.h> #include <linux/dma-resv.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/memfd.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/shmem_fs.h> #include <linux/slab.h> #include <linux/udmabuf.h> #include <linux/vmalloc.h> #include <linux/iosys-map.h> static int list_limit = 1024; module_param(list_limit, int, 0644); MODULE_PARM_DESC(list_limit, "udmabuf_create_list->count limit. Default is 1024."); static int size_limit_mb = 64; module_param(size_limit_mb, int, 0644); MODULE_PARM_DESC(size_limit_mb, "Max size of a dmabuf, in megabytes. 
Default is 64."); struct udmabuf { pgoff_t pagecount; struct page **pages; struct sg_table *sg; struct miscdevice *device; }; static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct udmabuf *ubuf = vma->vm_private_data; pgoff_t pgoff = vmf->pgoff; if (pgoff >= ubuf->pagecount) return VM_FAULT_SIGBUS; vmf->page = ubuf->pages[pgoff]; get_page(vmf->page); return 0; } static const struct vm_operations_struct udmabuf_vm_ops = { .fault = udmabuf_vm_fault, }; static int mmap_udmabuf(struct dma_buf *buf, struct vm_area_struct *vma) { struct udmabuf *ubuf = buf->priv; if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; vma->vm_ops = &udmabuf_vm_ops; vma->vm_private_data = ubuf; return 0; } static int vmap_udmabuf(struct dma_buf *buf, struct iosys_map *map) { struct udmabuf *ubuf = buf->priv; void *vaddr; dma_resv_assert_held(buf->resv); vaddr = vm_map_ram(ubuf->pages, ubuf->pagecount, -1); if (!vaddr) return -EINVAL; iosys_map_set_vaddr(map, vaddr); return 0; } static void vunmap_udmabuf(struct dma_buf *buf, struct iosys_map *map) { struct udmabuf *ubuf = buf->priv; dma_resv_assert_held(buf->resv); vm_unmap_ram(map->vaddr, ubuf->pagecount); } static struct sg_table *get_sg_table(struct device *dev, struct dma_buf *buf, enum dma_data_direction direction) { struct udmabuf *ubuf = buf->priv; struct sg_table *sg; int ret; sg = kzalloc(sizeof(*sg), GFP_KERNEL); if (!sg) return ERR_PTR(-ENOMEM); ret = sg_alloc_table_from_pages(sg, ubuf->pages, ubuf->pagecount, 0, ubuf->pagecount << PAGE_SHIFT, GFP_KERNEL); if (ret < 0) goto err; ret = dma_map_sgtable(dev, sg, direction, 0); if (ret < 0) goto err; return sg; err: sg_free_table(sg); kfree(sg); return ERR_PTR(ret); } static void put_sg_table(struct device *dev, struct sg_table *sg, enum dma_data_direction direction) { dma_unmap_sgtable(dev, sg, direction, 0); sg_free_table(sg); kfree(sg); } static struct sg_table *map_udmabuf(struct dma_buf_attachment *at, enum dma_data_direction direction) { return get_sg_table(at->dev, at->dmabuf, direction); } static void unmap_udmabuf(struct dma_buf_attachment *at, struct sg_table *sg, enum dma_data_direction direction) { return put_sg_table(at->dev, sg, direction); } static void release_udmabuf(struct dma_buf *buf) { struct udmabuf *ubuf = buf->priv; struct device *dev = ubuf->device->this_device; pgoff_t pg; if (ubuf->sg) put_sg_table(dev, ubuf->sg, DMA_BIDIRECTIONAL); for (pg = 0; pg < ubuf->pagecount; pg++) put_page(ubuf->pages[pg]); kfree(ubuf->pages); kfree(ubuf); } static int begin_cpu_udmabuf(struct dma_buf *buf, enum dma_data_direction direction) { struct udmabuf *ubuf = buf->priv; struct device *dev = ubuf->device->this_device; int ret = 0; if (!ubuf->sg) { ubuf->sg = get_sg_table(dev, buf, direction); if (IS_ERR(ubuf->sg)) { ret = PTR_ERR(ubuf->sg); ubuf->sg = NULL; } } else { dma_sync_sg_for_cpu(dev, ubuf->sg->sgl, ubuf->sg->nents, direction); } return ret; } static int end_cpu_udmabuf(struct dma_buf *buf, enum dma_data_direction direction) { struct udmabuf *ubuf = buf->priv; struct device *dev = ubuf->device->this_device; if (!ubuf->sg) return -EINVAL; dma_sync_sg_for_device(dev, ubuf->sg->sgl, ubuf->sg->nents, direction); return 0; } static const struct dma_buf_ops udmabuf_ops = { .cache_sgt_mapping = true, .map_dma_buf = map_udmabuf, .unmap_dma_buf = unmap_udmabuf, .release = release_udmabuf, .mmap = mmap_udmabuf, .vmap = vmap_udmabuf, .vunmap = vunmap_udmabuf, .begin_cpu_access = begin_cpu_udmabuf, .end_cpu_access = end_cpu_udmabuf, }; 
#define SEALS_WANTED (F_SEAL_SHRINK) #define SEALS_DENIED (F_SEAL_WRITE) static long udmabuf_create(struct miscdevice *device, struct udmabuf_create_list *head, struct udmabuf_create_item *list) { DEFINE_DMA_BUF_EXPORT_INFO(exp_info); struct file *memfd = NULL; struct address_space *mapping = NULL; struct udmabuf *ubuf; struct dma_buf *buf; pgoff_t pgoff, pgcnt, pgidx, pgbuf = 0, pglimit; struct page *page; int seals, ret = -EINVAL; u32 i, flags; ubuf = kzalloc(sizeof(*ubuf), GFP_KERNEL); if (!ubuf) return -ENOMEM; pglimit = (size_limit_mb * 1024 * 1024) >> PAGE_SHIFT; for (i = 0; i < head->count; i++) { if (!IS_ALIGNED(list[i].offset, PAGE_SIZE)) goto err; if (!IS_ALIGNED(list[i].size, PAGE_SIZE)) goto err; ubuf->pagecount += list[i].size >> PAGE_SHIFT; if (ubuf->pagecount > pglimit) goto err; } if (!ubuf->pagecount) goto err; ubuf->pages = kmalloc_array(ubuf->pagecount, sizeof(*ubuf->pages), GFP_KERNEL); if (!ubuf->pages) { ret = -ENOMEM; goto err; } pgbuf = 0; for (i = 0; i < head->count; i++) { ret = -EBADFD; memfd = fget(list[i].memfd); if (!memfd) goto err; mapping = memfd->f_mapping; if (!shmem_mapping(mapping)) goto err; seals = memfd_fcntl(memfd, F_GET_SEALS, 0); if (seals == -EINVAL) goto err; ret = -EINVAL; if ((seals & SEALS_WANTED) != SEALS_WANTED || (seals & SEALS_DENIED) != 0) goto err; pgoff = list[i].offset >> PAGE_SHIFT; pgcnt = list[i].size >> PAGE_SHIFT; for (pgidx = 0; pgidx < pgcnt; pgidx++) { page = shmem_read_mapping_page(mapping, pgoff + pgidx); if (IS_ERR(page)) { ret = PTR_ERR(page); goto err; } ubuf->pages[pgbuf++] = page; } fput(memfd); memfd = NULL; } exp_info.ops = &udmabuf_ops; exp_info.size = ubuf->pagecount << PAGE_SHIFT; exp_info.priv = ubuf; exp_info.flags = O_RDWR; ubuf->device = device; buf = dma_buf_export(&exp_info); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto err; } flags = 0; if (head->flags & UDMABUF_FLAGS_CLOEXEC) flags |= O_CLOEXEC; return dma_buf_fd(buf, flags); err: while (pgbuf > 0) put_page(ubuf->pages[--pgbuf]); if (memfd) fput(memfd); kfree(ubuf->pages); kfree(ubuf); return ret; } static long udmabuf_ioctl_create(struct file *filp, unsigned long arg) { struct udmabuf_create create; struct udmabuf_create_list head; struct udmabuf_create_item list; if (copy_from_user(&create, (void __user *)arg, sizeof(create))) return -EFAULT; head.flags = create.flags; head.count = 1; list.memfd = create.memfd; list.offset = create.offset; list.size = create.size; return udmabuf_create(filp->private_data, &head, &list); } static long udmabuf_ioctl_create_list(struct file *filp, unsigned long arg) { struct udmabuf_create_list head; struct udmabuf_create_item *list; int ret = -EINVAL; u32 lsize; if (copy_from_user(&head, (void __user *)arg, sizeof(head))) return -EFAULT; if (head.count > list_limit) return -EINVAL; lsize = sizeof(struct udmabuf_create_item) * head.count; list = memdup_user((void __user *)(arg + sizeof(head)), lsize); if (IS_ERR(list)) return PTR_ERR(list); ret = udmabuf_create(filp->private_data, &head, list); kfree(list); return ret; } static long udmabuf_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { long ret; switch (ioctl) { case UDMABUF_CREATE: ret = udmabuf_ioctl_create(filp, arg); break; case UDMABUF_CREATE_LIST: ret = udmabuf_ioctl_create_list(filp, arg); break; default: ret = -ENOTTY; break; } return ret; } static const struct file_operations udmabuf_fops = { .owner = THIS_MODULE, .unlocked_ioctl = udmabuf_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = udmabuf_ioctl, #endif }; static struct miscdevice udmabuf_misc = 
{ .minor = MISC_DYNAMIC_MINOR, .name = "udmabuf", .fops = &udmabuf_fops, }; static int __init udmabuf_dev_init(void) { int ret; ret = misc_register(&udmabuf_misc); if (ret < 0) { pr_err("Could not initialize udmabuf device\n"); return ret; } ret = dma_coerce_mask_and_coherent(udmabuf_misc.this_device, DMA_BIT_MASK(64)); if (ret < 0) { pr_err("Could not setup DMA mask for udmabuf device\n"); misc_deregister(&udmabuf_misc); return ret; } return 0; } static void __exit udmabuf_dev_exit(void) { misc_deregister(&udmabuf_misc); } module_init(udmabuf_dev_init) module_exit(udmabuf_dev_exit) MODULE_AUTHOR("Gerd Hoffmann <kraxel@redhat.com>");
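/*
 * Editor's illustration (not part of the driver): a user-space sketch of the
 * create path handled by udmabuf_create() above, assuming the uapi header
 * <linux/udmabuf.h> and a glibc with memfd_create(); error handling is
 * abbreviated. As the driver's checks require, offset and size must be page
 * aligned and the memfd must carry F_SEAL_SHRINK but not F_SEAL_WRITE.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/udmabuf.h>

int main(void)
{
	const size_t size = 4 * 4096;	/* page aligned */
	struct udmabuf_create create;
	int devfd, memfd, buffd;

	devfd = open("/dev/udmabuf", O_RDWR);
	memfd = memfd_create("udmabuf-backing", MFD_ALLOW_SEALING);
	if (devfd < 0 || memfd < 0)
		return 1;
	if (ftruncate(memfd, size) < 0)
		return 1;
	fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK);	/* required by the driver */

	memset(&create, 0, sizeof(create));
	create.memfd = memfd;
	create.offset = 0;		/* page aligned */
	create.size = size;		/* page aligned */

	buffd = ioctl(devfd, UDMABUF_CREATE, &create);
	if (buffd < 0) {
		perror("UDMABUF_CREATE");
		return 1;
	}
	printf("dma-buf fd: %d\n", buffd);
	close(buffd);
	close(memfd);
	close(devfd);
	return 0;
}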
1591 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SCSI_SCSI_CMND_H #define _SCSI_SCSI_CMND_H #include <linux/dma-mapping.h> #include <linux/blkdev.h> #include <linux/t10-pi.h> #include <linux/list.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/scatterlist.h> #include <scsi/scsi_device.h> struct Scsi_Host; /* * MAX_COMMAND_SIZE is: * The longest fixed-length SCSI CDB as per the SCSI standard. * fixed-length means: commands that their size can be determined * by their opcode and the CDB does not carry a length specifier, (unlike * the VARIABLE_LENGTH_CMD(0x7f) command). This is actually not exactly * true and the SCSI standard also defines extended commands and * vendor specific commands that can be bigger than 16 bytes. The kernel * will support these using the same infrastructure used for VARLEN CDB's. * So in effect MAX_COMMAND_SIZE means the maximum size command scsi-ml * supports without specifying a cmd_len by ULD's */ #define MAX_COMMAND_SIZE 16 struct scsi_data_buffer { struct sg_table table; unsigned length; }; /* embedded in scsi_cmnd */ struct scsi_pointer { char *ptr; /* data pointer */ int this_residual; /* left in this buffer */ struct scatterlist *buffer; /* which buffer */ int buffers_residual; /* how many buffers left */ dma_addr_t dma_handle; volatile int Status; volatile int Message; volatile int have_data_in; volatile int sent_command; volatile int phase; }; /* for scmd->flags */ #define SCMD_TAGGED (1 << 0) #define SCMD_INITIALIZED (1 << 1) #define SCMD_LAST (1 << 2) /* * libata uses SCSI EH to fetch sense data for successful commands. * SCSI EH should not overwrite scmd->result when SCMD_FORCE_EH_SUCCESS is set. 
*/ #define SCMD_FORCE_EH_SUCCESS (1 << 3) #define SCMD_FAIL_IF_RECOVERING (1 << 4) /* flags preserved across unprep / reprep */ #define SCMD_PRESERVED_FLAGS (SCMD_INITIALIZED | SCMD_FAIL_IF_RECOVERING) /* for scmd->state */ #define SCMD_STATE_COMPLETE 0 #define SCMD_STATE_INFLIGHT 1 enum scsi_cmnd_submitter { SUBMITTED_BY_BLOCK_LAYER = 0, SUBMITTED_BY_SCSI_ERROR_HANDLER = 1, SUBMITTED_BY_SCSI_RESET_IOCTL = 2, } __packed; struct scsi_cmnd { struct scsi_device *device; struct list_head eh_entry; /* entry for the host eh_abort_list/eh_cmd_q */ struct delayed_work abort_work; struct rcu_head rcu; int eh_eflags; /* Used by error handlr */ int budget_token; /* * This is set to jiffies as it was when the command was first * allocated. It is used to time how long the command has * been outstanding */ unsigned long jiffies_at_alloc; int retries; int allowed; unsigned char prot_op; unsigned char prot_type; unsigned char prot_flags; enum scsi_cmnd_submitter submitter; unsigned short cmd_len; enum dma_data_direction sc_data_direction; unsigned char cmnd[32]; /* SCSI CDB */ /* These elements define the operation we ultimately want to perform */ struct scsi_data_buffer sdb; struct scsi_data_buffer *prot_sdb; unsigned underflow; /* Return error if less than this amount is transferred */ unsigned transfersize; /* How much we are guaranteed to transfer with each SCSI transfer (ie, between disconnect / reconnects. Probably == sector size */ unsigned resid_len; /* residual count */ unsigned sense_len; unsigned char *sense_buffer; /* obtained by REQUEST SENSE when * CHECK CONDITION is received on original * command (auto-sense). Length must be * SCSI_SENSE_BUFFERSIZE bytes. */ int flags; /* Command flags */ unsigned long state; /* Command completion state */ unsigned int extra_len; /* length of alignment and padding */ /* * The fields below can be modified by the LLD but the fields above * must not be modified. */ unsigned char *host_scribble; /* The host adapter is allowed to * call scsi_malloc and get some memory * and hang it here. The host adapter * is also expected to call scsi_free * to release this memory. (The memory * obtained by scsi_malloc is guaranteed * to be at an address < 16Mb). */ int result; /* Status code from lower level driver */ }; /* Variant of blk_mq_rq_from_pdu() that verifies the type of its argument. */ static inline struct request *scsi_cmd_to_rq(struct scsi_cmnd *scmd) { return blk_mq_rq_from_pdu(scmd); } /* * Return the driver private allocation behind the command. * Only works if cmd_size is set in the host template. 
*/ static inline void *scsi_cmd_priv(struct scsi_cmnd *cmd) { return cmd + 1; } void scsi_done(struct scsi_cmnd *cmd); void scsi_done_direct(struct scsi_cmnd *cmd); extern void scsi_finish_command(struct scsi_cmnd *cmd); extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count, size_t *offset, size_t *len); extern void scsi_kunmap_atomic_sg(void *virt); blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd); void scsi_free_sgtables(struct scsi_cmnd *cmd); #ifdef CONFIG_SCSI_DMA extern int scsi_dma_map(struct scsi_cmnd *cmd); extern void scsi_dma_unmap(struct scsi_cmnd *cmd); #else /* !CONFIG_SCSI_DMA */ static inline int scsi_dma_map(struct scsi_cmnd *cmd) { return -ENOSYS; } static inline void scsi_dma_unmap(struct scsi_cmnd *cmd) { } #endif /* !CONFIG_SCSI_DMA */ static inline unsigned scsi_sg_count(struct scsi_cmnd *cmd) { return cmd->sdb.table.nents; } static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd) { return cmd->sdb.table.sgl; } static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd) { return cmd->sdb.length; } static inline void scsi_set_resid(struct scsi_cmnd *cmd, unsigned int resid) { cmd->resid_len = resid; } static inline unsigned int scsi_get_resid(struct scsi_cmnd *cmd) { return cmd->resid_len; } #define scsi_for_each_sg(cmd, sg, nseg, __i) \ for_each_sg(scsi_sglist(cmd), sg, nseg, __i) static inline int scsi_sg_copy_from_buffer(struct scsi_cmnd *cmd, const void *buf, int buflen) { return sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buf, buflen); } static inline int scsi_sg_copy_to_buffer(struct scsi_cmnd *cmd, void *buf, int buflen) { return sg_copy_to_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buf, buflen); } static inline sector_t scsi_get_sector(struct scsi_cmnd *scmd) { return blk_rq_pos(scsi_cmd_to_rq(scmd)); } static inline sector_t scsi_get_lba(struct scsi_cmnd *scmd) { unsigned int shift = ilog2(scmd->device->sector_size) - SECTOR_SHIFT; return blk_rq_pos(scsi_cmd_to_rq(scmd)) >> shift; } static inline unsigned int scsi_logical_block_count(struct scsi_cmnd *scmd) { unsigned int shift = ilog2(scmd->device->sector_size) - SECTOR_SHIFT; return blk_rq_bytes(scsi_cmd_to_rq(scmd)) >> shift; } /* * The operations below are hints that tell the controller driver how * to handle I/Os with DIF or similar types of protection information. */ enum scsi_prot_operations { /* Normal I/O */ SCSI_PROT_NORMAL = 0, /* OS-HBA: Protected, HBA-Target: Unprotected */ SCSI_PROT_READ_INSERT, SCSI_PROT_WRITE_STRIP, /* OS-HBA: Unprotected, HBA-Target: Protected */ SCSI_PROT_READ_STRIP, SCSI_PROT_WRITE_INSERT, /* OS-HBA: Protected, HBA-Target: Protected */ SCSI_PROT_READ_PASS, SCSI_PROT_WRITE_PASS, }; static inline void scsi_set_prot_op(struct scsi_cmnd *scmd, unsigned char op) { scmd->prot_op = op; } static inline unsigned char scsi_get_prot_op(struct scsi_cmnd *scmd) { return scmd->prot_op; } enum scsi_prot_flags { SCSI_PROT_TRANSFER_PI = 1 << 0, SCSI_PROT_GUARD_CHECK = 1 << 1, SCSI_PROT_REF_CHECK = 1 << 2, SCSI_PROT_REF_INCREMENT = 1 << 3, SCSI_PROT_IP_CHECKSUM = 1 << 4, }; /* * The controller usually does not know anything about the target it * is communicating with. However, when DIX is enabled the controller * must be know target type so it can verify the protection * information passed along with the I/O. 
*/ enum scsi_prot_target_type { SCSI_PROT_DIF_TYPE0 = 0, SCSI_PROT_DIF_TYPE1, SCSI_PROT_DIF_TYPE2, SCSI_PROT_DIF_TYPE3, }; static inline void scsi_set_prot_type(struct scsi_cmnd *scmd, unsigned char type) { scmd->prot_type = type; } static inline unsigned char scsi_get_prot_type(struct scsi_cmnd *scmd) { return scmd->prot_type; } static inline u32 scsi_prot_ref_tag(struct scsi_cmnd *scmd) { struct request *rq = blk_mq_rq_from_pdu(scmd); return t10_pi_ref_tag(rq); } static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd) { return scmd->device->sector_size; } static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd) { return cmd->prot_sdb ? cmd->prot_sdb->table.nents : 0; } static inline struct scatterlist *scsi_prot_sglist(struct scsi_cmnd *cmd) { return cmd->prot_sdb ? cmd->prot_sdb->table.sgl : NULL; } static inline struct scsi_data_buffer *scsi_prot(struct scsi_cmnd *cmd) { return cmd->prot_sdb; } #define scsi_for_each_prot_sg(cmd, sg, nseg, __i) \ for_each_sg(scsi_prot_sglist(cmd), sg, nseg, __i) static inline void set_status_byte(struct scsi_cmnd *cmd, char status) { cmd->result = (cmd->result & 0xffffff00) | status; } static inline u8 get_status_byte(struct scsi_cmnd *cmd) { return cmd->result & 0xff; } static inline void set_host_byte(struct scsi_cmnd *cmd, char status) { cmd->result = (cmd->result & 0xff00ffff) | (status << 16); } static inline u8 get_host_byte(struct scsi_cmnd *cmd) { return (cmd->result >> 16) & 0xff; } /** * scsi_msg_to_host_byte() - translate message byte * * Translate the SCSI parallel message byte to a matching * host byte setting. A message of COMMAND_COMPLETE indicates * a successful command execution, any other message indicate * an error. As the messages themselves only have a meaning * for the SCSI parallel protocol this function translates * them into a matching host byte value for SCSI EH. */ static inline void scsi_msg_to_host_byte(struct scsi_cmnd *cmd, u8 msg) { switch (msg) { case COMMAND_COMPLETE: break; case ABORT_TASK_SET: set_host_byte(cmd, DID_ABORT); break; case TARGET_RESET: set_host_byte(cmd, DID_RESET); break; default: set_host_byte(cmd, DID_ERROR); break; } } static inline unsigned scsi_transfer_length(struct scsi_cmnd *scmd) { unsigned int xfer_len = scmd->sdb.length; unsigned int prot_interval = scsi_prot_interval(scmd); if (scmd->prot_flags & SCSI_PROT_TRANSFER_PI) xfer_len += (xfer_len >> ilog2(prot_interval)) * 8; return xfer_len; } extern void scsi_build_sense(struct scsi_cmnd *scmd, int desc, u8 key, u8 asc, u8 ascq); struct request *scsi_alloc_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags); #endif /* _SCSI_SCSI_CMND_H */
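/*
 * Usage sketch (not part of this header): how a low-level driver would
 * typically consume the accessors above when completing a command. The
 * driver function and its "hw_ok"/"bytes_not_xferred" inputs are
 * hypothetical; scsi_set_resid(), set_host_byte(), scsi_done() and the
 * DID_* host byte codes are the standard interfaces referenced here.
 */
#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>

static void hypothetical_hba_complete_cmd(struct scsi_cmnd *cmd,
					  bool hw_ok, u32 bytes_not_xferred)
{
	/* Record how much of scsi_bufflen(cmd) was not transferred. */
	scsi_set_resid(cmd, bytes_not_xferred);

	/* Fold the hardware outcome into the host byte of cmd->result. */
	set_host_byte(cmd, hw_ok ? DID_OK : DID_ERROR);

	/* Hand the command back to the SCSI midlayer. */
	scsi_done(cmd);
}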
25 3 3 16 4 9 20 3 7 19 2 1 2 14 23 4 6 13 19 11 1 10 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Facebook */ #include <linux/slab.h> #include <linux/bpf.h> #include <linux/btf.h> #include "map_in_map.h" struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) { struct bpf_map *inner_map, *inner_map_meta; u32 inner_map_meta_size; struct fd f; int ret; f = fdget(inner_map_ufd); inner_map = __bpf_map_get(f); if (IS_ERR(inner_map)) return inner_map; /* Does not support >1 level map-in-map */ if (inner_map->inner_map_meta) { ret = -EINVAL; goto put; } if (!inner_map->ops->map_meta_equal) { ret = -ENOTSUPP; goto put; } inner_map_meta_size = sizeof(*inner_map_meta); /* In some cases verifier needs to access beyond just base map. */ if (inner_map->ops == &array_map_ops) inner_map_meta_size = sizeof(struct bpf_array); inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); if (!inner_map_meta) { ret = -ENOMEM; goto put; } inner_map_meta->map_type = inner_map->map_type; inner_map_meta->key_size = inner_map->key_size; inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; inner_map_meta->max_entries = inner_map->max_entries; inner_map_meta->record = btf_record_dup(inner_map->record); if (IS_ERR(inner_map_meta->record)) { /* btf_record_dup returns NULL or valid pointer in case of * invalid/empty/valid, but ERR_PTR in case of errors. During * equality NULL or IS_ERR is equivalent. */ ret = PTR_ERR(inner_map_meta->record); goto free; } /* Note: We must use the same BTF, as we also used btf_record_dup above * which relies on BTF being same for both maps, as some members like * record->fields.list_head have pointers like value_rec pointing into * inner_map->btf. */ if (inner_map->btf) { btf_get(inner_map->btf); inner_map_meta->btf = inner_map->btf; } /* Misc members not needed in bpf_map_meta_equal() check. 
*/ inner_map_meta->ops = inner_map->ops; if (inner_map->ops == &array_map_ops) { struct bpf_array *inner_array_meta = container_of(inner_map_meta, struct bpf_array, map); struct bpf_array *inner_array = container_of(inner_map, struct bpf_array, map); inner_array_meta->index_mask = inner_array->index_mask; inner_array_meta->elem_size = inner_array->elem_size; inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1; } fdput(f); return inner_map_meta; free: kfree(inner_map_meta); put: fdput(f); return ERR_PTR(ret); } void bpf_map_meta_free(struct bpf_map *map_meta) { bpf_map_free_record(map_meta); btf_put(map_meta->btf); kfree(map_meta); } bool bpf_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { /* No need to compare ops because it is covered by map_type */ return meta0->map_type == meta1->map_type && meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && meta0->map_flags == meta1->map_flags && btf_record_equal(meta0->record, meta1->record); } void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, int ufd) { struct bpf_map *inner_map, *inner_map_meta; struct fd f; f = fdget(ufd); inner_map = __bpf_map_get(f); if (IS_ERR(inner_map)) return inner_map; inner_map_meta = map->inner_map_meta; if (inner_map_meta->ops->map_meta_equal(inner_map_meta, inner_map)) bpf_map_inc(inner_map); else inner_map = ERR_PTR(-EINVAL); fdput(f); return inner_map; } void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { struct bpf_map *inner_map = ptr; /* Defer the freeing of inner map according to the sleepable attribute * of bpf program which owns the outer map, so unnecessary waiting for * RCU tasks trace grace period can be avoided. */ if (need_defer) { if (atomic64_read(&map->sleepable_refcnt)) WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true); else WRITE_ONCE(inner_map->free_after_rcu_gp, true); } bpf_map_put(inner_map); } u32 bpf_map_fd_sys_lookup_elem(void *ptr) { return ((struct bpf_map *)ptr)->id; }
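/*
 * Usage sketch (not part of the original file): what bpf_map_meta_equal()
 * implies for userspace. The template map given at outer-map creation time
 * and any inner map inserted later must agree on map_type, key_size,
 * value_size and map_flags (max_entries may differ). Hypothetical example
 * using libbpf's bpf_map_create()/bpf_map_update_elem(); error handling is
 * omitted for brevity.
 */
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

static int create_array_of_maps(void)
{
	int tmpl_fd, inner_fd, outer_fd;
	__u32 slot = 0;

	/* Template inner map: only its shape matters; it may be closed later. */
	tmpl_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "tmpl", 4, 8, 1, NULL);

	LIBBPF_OPTS(bpf_map_create_opts, opts, .inner_map_fd = tmpl_fd);
	/* Outer map values are 4-byte map FDs (IDs on lookup). */
	outer_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY_OF_MAPS, "outer",
				  4, 4, 8, &opts);

	/* This inner map matches the template's shape, so the update succeeds. */
	inner_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "inner", 4, 8, 128, NULL);
	return bpf_map_update_elem(outer_fd, &slot, &inner_fd, BPF_ANY);
}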
2 2 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 /* * linux/fs/nls/nls_koi8-r.c * * Charset koi8-r translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x2500, 0x2502, 0x250c, 0x2510, 0x2514, 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590, /* 0x90*/ 0x2591, 0x2592, 0x2593, 0x2320, 0x25a0, 0x2219, 0x221a, 0x2248, 0x2264, 0x2265, 0x00a0, 0x2321, 0x00b0, 0x00b2, 0x00b7, 0x00f7, /* 0xa0*/ 0x2550, 0x2551, 0x2552, 0x0451, 0x2553, 0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 0x255e, /* 0xb0*/ 0x255f, 0x2560, 0x2561, 0x0401, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x00a9, /* 0xc0*/ 0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433, 0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, /* 0xd0*/ 0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432, 0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a, /* 0xe0*/ 0x042e, 
0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413, 0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, /* 0xf0*/ 0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412, 0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0xbf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x9c, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9f, /* 0xf0-0xf7 */ }; static const unsigned char page04[256] = { 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, /* 0x10-0x17 */ 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, /* 0x18-0x1f */ 0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, /* 0x20-0x27 */ 0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1, /* 0x28-0x2f */ 0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda, /* 0x30-0x37 */ 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, /* 0x38-0x3f */ 0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde, /* 0x40-0x47 */ 0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1, /* 0x48-0x4f */ 0x00, 0xa3, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x95, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x98, 0x99, 0x00, 0x00, /* 0x60-0x67 */ }; static const unsigned char page23[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x93, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ }; static const unsigned char page25[256] = { 0x80, 0x00, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x83, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x85, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0xa0, 0xa1, 0xa2, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, /* 0x50-0x57 */ 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, /* 0x58-0x5f */ 0xb1, 0xb2, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, /* 0x60-0x67 */ 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x8b, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x8d, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x8f, 0x90, 0x91, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x94, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char *const page_uni2charset[256] = { page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page22, page23, NULL, page25, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 
0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xa3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xf0-0xf7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xb3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xd0-0xd7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if 
(*uni == 0x0000)
		return -EINVAL;
	return 1;
}

static struct nls_table table = {
	.charset	= "koi8-r",
	.uni2char	= uni2char,
	.char2uni	= char2uni,
	.charset2lower	= charset2lower,
	.charset2upper	= charset2upper,
};

static int __init init_nls_koi8_r(void)
{
	return register_nls(&table);
}

static void __exit exit_nls_koi8_r(void)
{
	unregister_nls(&table);
}

module_init(init_nls_koi8_r)
module_exit(exit_nls_koi8_r)

MODULE_LICENSE("Dual BSD/GPL");
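/*
 * Usage sketch (not part of the original file): how a filesystem typically
 * consumes a registered NLS table such as the one above. The helper below
 * is hypothetical; load_nls()/unload_nls() and the char2uni() method are
 * the standard <linux/nls.h> interface.
 */
#include <linux/nls.h>

static int koi8r_byte_to_unicode(unsigned char c, wchar_t *uni)
{
	struct nls_table *nls;
	int ret;

	nls = load_nls("koi8-r");
	if (!nls)
		return -EINVAL;

	/* char2uni() returns the number of bytes consumed (1) or -EINVAL. */
	ret = nls->char2uni(&c, 1, uni);

	unload_nls(nls);
	return ret < 0 ? ret : 0;
}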
// SPDX-License-Identifier: GPL-2.0-only
/*
 * A generic implementation of binary search for the Linux kernel
 *
 * Copyright (C) 2008-2009 Ksplice, Inc.
 * Author: Tim Abbott <tabbott@ksplice.com>
 */

#include <linux/export.h>
#include <linux/bsearch.h>
#include <linux/kprobes.h>

/*
 * bsearch - binary search an array of elements
 * @key: pointer to item being searched for
 * @base: pointer to first element to search
 * @num: number of elements
 * @size: size of each element
 * @cmp: pointer to comparison function
 *
 * This function does a binary search on the given array. The
 * contents of the array should already be in ascending sorted order
 * under the provided comparison function.
 *
 * Note that the key need not have the same type as the elements in
 * the array, e.g. key could be a string and the comparison function
 * could compare the string with the struct's name field. However, if
 * the key and elements in the array are of the same type, you can use
 * the same comparison function for both sort() and bsearch().
 */
void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
{
	return __inline_bsearch(key, base, num, size, cmp);
}
EXPORT_SYMBOL(bsearch);

NOKPROBE_SYMBOL(bsearch);
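/*
 * Usage sketch (not part of the original file): looking up an entry in a
 * sorted table with bsearch(). The symbol table, its element type and the
 * comparison callback are hypothetical; only bsearch() and cmp_func_t come
 * from the API above.
 */
#include <linux/bsearch.h>
#include <linux/kernel.h>
#include <linux/string.h>

struct sym_entry {
	const char *name;
	unsigned long addr;
};

/* Must already be sorted by name, in the order sym_cmp() implies. */
static const struct sym_entry sym_table[] = {
	{ "alpha", 0x1000 },
	{ "bravo", 0x2000 },
	{ "delta", 0x3000 },
};

static int sym_cmp(const void *key, const void *elt)
{
	/* The key is a plain string while the element is a struct. */
	return strcmp(key, ((const struct sym_entry *)elt)->name);
}

static unsigned long sym_lookup(const char *name)
{
	const struct sym_entry *e;

	e = bsearch(name, sym_table, ARRAY_SIZE(sym_table),
		    sizeof(sym_table[0]), sym_cmp);
	return e ? e->addr : 0;
}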
// SPDX-License-Identifier: GPL-2.0
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/export.h>
#include <linux/limits.h>
#include <linux/math.h>
#include <linux/minmax.h>
#include <linux/types.h>

#include <linux/reciprocal_div.h>

/*
 * For a description of the algorithm please have a look at
 * include/linux/reciprocal_div.h
 */

struct reciprocal_value reciprocal_value(u32 d)
{
	struct reciprocal_value R;
	u64 m;
	int l;

	l = fls(d - 1);
	m = ((1ULL << 32) * ((1ULL << l) - d));
	do_div(m, d);
	++m;
	R.m = (u32)m;
	R.sh1 = min(l, 1);
	R.sh2 = max(l - 1, 0);

	return R;
}
EXPORT_SYMBOL(reciprocal_value);

struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec)
{
	struct reciprocal_value_adv R;
	u32 l, post_shift;
	u64 mhigh, mlow;

	/* ceil(log2(d)) */
	l = fls(d - 1);
	/* NOTE: mlow/mhigh could overflow u64 when l == 32. This case needs to
	 * be handled before calling "reciprocal_value_adv", please see the
	 * comment at include/linux/reciprocal_div.h.
	 */
	WARN(l == 32,
	     "ceil(log2(0x%08x)) == 32, %s doesn't support such divisor",
	     d, __func__);
	post_shift = l;
	mlow = 1ULL << (32 + l);
	do_div(mlow, d);
	mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec));
	do_div(mhigh, d);

	for (; post_shift > 0; post_shift--) {
		u64 lo = mlow >> 1, hi = mhigh >> 1;

		if (lo >= hi)
			break;

		mlow = lo;
		mhigh = hi;
	}

	R.m = (u32)mhigh;
	R.sh = post_shift;
	R.exp = l;
	R.is_wide_m = mhigh > U32_MAX;

	return R;
}
EXPORT_SYMBOL(reciprocal_value_adv);
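/*
 * Usage sketch (not part of the original file): the intended pattern is to
 * compute the reciprocal once for a divisor that stays fixed across many
 * operations, then replace each division with reciprocal_divide() from
 * <linux/reciprocal_div.h>. The flow-table structure is hypothetical.
 */
#include <linux/reciprocal_div.h>
#include <linux/types.h>

struct flow_table {
	u32 nr_buckets;			/* runtime-chosen divisor */
	struct reciprocal_value nr_rcp;	/* precomputed reciprocal */
};

static void flow_table_init(struct flow_table *ft, u32 nr_buckets)
{
	ft->nr_buckets = nr_buckets;
	ft->nr_rcp = reciprocal_value(nr_buckets);
}

static u32 flow_table_bucket(const struct flow_table *ft, u32 hash)
{
	/* hash % nr_buckets, computed without a hardware divide */
	return hash - reciprocal_divide(hash, ft->nr_rcp) * ft->nr_buckets;
}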
20 45 7 40 41 37 29 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 // SPDX-License-Identifier: GPL-2.0 OR MIT /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * This is based in part on Andrew Moon's poly1305-donna, which is in the * public domain. */ #include <linux/kernel.h> #include <asm/unaligned.h> #include <crypto/internal/poly1305.h> void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[POLY1305_BLOCK_SIZE]) { u64 t0, t1; /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ t0 = get_unaligned_le64(&raw_key[0]); t1 = get_unaligned_le64(&raw_key[8]); key->key.r64[0] = t0 & 0xffc0fffffffULL; key->key.r64[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffffULL; key->key.r64[2] = ((t1 >> 24)) & 0x00ffffffc0fULL; /* s = 20*r */ key->precomputed_s.r64[0] = key->key.r64[1] * 20; key->precomputed_s.r64[1] = key->key.r64[2] * 20; } EXPORT_SYMBOL(poly1305_core_setkey); void poly1305_core_blocks(struct poly1305_state *state, const struct poly1305_core_key *key, const void *src, unsigned int nblocks, u32 hibit) { const u8 *input = src; u64 hibit64; u64 r0, r1, r2; u64 s1, s2; u64 h0, h1, h2; u64 c; u128 d0, d1, d2, d; if (!nblocks) return; hibit64 = ((u64)hibit) << 40; r0 = key->key.r64[0]; r1 = key->key.r64[1]; r2 = key->key.r64[2]; h0 = state->h64[0]; h1 = state->h64[1]; h2 = state->h64[2]; s1 = key->precomputed_s.r64[0]; s2 = key->precomputed_s.r64[1]; do { u64 t0, t1; /* h += m[i] */ t0 = get_unaligned_le64(&input[0]); t1 = get_unaligned_le64(&input[8]); h0 += t0 & 0xfffffffffffULL; h1 += ((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL; h2 += (((t1 >> 24)) & 0x3ffffffffffULL) | hibit64; /* h *= r */ d0 = (u128)h0 * r0; d = (u128)h1 * s2; d0 += d; d = (u128)h2 * s1; d0 += d; d1 = (u128)h0 * r1; d = (u128)h1 * r0; d1 += d; d = (u128)h2 * s2; d1 += d; d2 = (u128)h0 * r2; d = (u128)h1 * r1; d2 += d; d = (u128)h2 * r0; d2 += d; /* (partial) h %= p */ c = (u64)(d0 >> 44); h0 = (u64)d0 & 0xfffffffffffULL; d1 += c; c = (u64)(d1 >> 44); h1 = (u64)d1 & 0xfffffffffffULL; d2 += c; c = (u64)(d2 >> 42); h2 = (u64)d2 & 0x3ffffffffffULL; h0 += c * 5; c = h0 >> 44; h0 = h0 & 0xfffffffffffULL; h1 += c; input += POLY1305_BLOCK_SIZE; } while (--nblocks); state->h64[0] = h0; state->h64[1] = h1; state->h64[2] = h2; } EXPORT_SYMBOL(poly1305_core_blocks); void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4], void *dst) { u8 *mac = dst; u64 h0, h1, h2, c; u64 g0, g1, g2; u64 t0, t1; /* fully carry h */ h0 = state->h64[0]; h1 = state->h64[1]; h2 = state->h64[2]; c = h1 >> 44; h1 &= 0xfffffffffffULL; h2 += c; c = h2 >> 42; h2 &= 0x3ffffffffffULL; h0 += c * 5; c = h0 >> 44; h0 &= 0xfffffffffffULL; h1 += c; c = h1 >> 44; h1 &= 0xfffffffffffULL; h2 += c; c = h2 >> 42; h2 &= 0x3ffffffffffULL; h0 += c * 5; c = h0 >> 44; h0 &= 0xfffffffffffULL; h1 += c; /* compute h + -p */ g0 = h0 + 5; c = g0 >> 44; g0 &= 0xfffffffffffULL; g1 = h1 + c; c = g1 >> 44; g1 &= 0xfffffffffffULL; g2 = h2 + 
c - (1ULL << 42);

	/* select h if h < p, or h + -p if h >= p */
	c = (g2 >> ((sizeof(u64) * 8) - 1)) - 1;
	g0 &= c;
	g1 &= c;
	g2 &= c;
	c = ~c;
	h0 = (h0 & c) | g0;
	h1 = (h1 & c) | g1;
	h2 = (h2 & c) | g2;

	if (likely(nonce)) {
		/* h = (h + nonce) */
		t0 = ((u64)nonce[1] << 32) | nonce[0];
		t1 = ((u64)nonce[3] << 32) | nonce[2];

		h0 += t0 & 0xfffffffffffULL;
		c = h0 >> 44;
		h0 &= 0xfffffffffffULL;
		h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffffULL) + c;
		c = h1 >> 44;
		h1 &= 0xfffffffffffULL;
		h2 += (((t1 >> 24)) & 0x3ffffffffffULL) + c;
		h2 &= 0x3ffffffffffULL;
	}

	/* mac = h % (2^128) */
	h0 = h0 | (h1 << 44);
	h1 = (h1 >> 20) | (h2 << 24);

	put_unaligned_le64(h0, &mac[0]);
	put_unaligned_le64(h1, &mac[8]);
}
EXPORT_SYMBOL(poly1305_core_emit);
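/*
 * Usage sketch (not part of the original file): a one-shot MAC over a
 * message that is a whole number of 16-byte blocks, built from the core
 * primitives above plus poly1305_core_init() from
 * <crypto/internal/poly1305.h>. A trailing partial block would additionally
 * need the usual 0x01 padding and a final call with hibit == 0, which is
 * omitted here; the wrapper function itself is hypothetical.
 */
#include <linux/string.h>
#include <asm/unaligned.h>
#include <crypto/internal/poly1305.h>

static void poly1305_mac_full_blocks(u8 mac[POLY1305_DIGEST_SIZE],
				     const u8 key[POLY1305_KEY_SIZE],
				     const u8 *msg, unsigned int nblocks)
{
	struct poly1305_core_key core_key;
	struct poly1305_state state;
	u32 nonce[4];
	int i;

	/* First 16 key bytes are r (clamped by setkey), last 16 are s. */
	poly1305_core_setkey(&core_key, key);
	for (i = 0; i < 4; i++)
		nonce[i] = get_unaligned_le32(key + 16 + i * 4);

	poly1305_core_init(&state);
	/* hibit == 1: append the 2^128 bit to every full block. */
	poly1305_core_blocks(&state, &core_key, msg, nblocks, 1);
	poly1305_core_emit(&state, nonce, mac);

	memzero_explicit(&core_key, sizeof(core_key));
	memzero_explicit(&state, sizeof(state));
}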
2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 // SPDX-License-Identifier: GPL-2.0-only /* * cec-adap.c - HDMI Consumer Electronics Control framework - CEC adapter * * Copyright 2016 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/errno.h> #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/ktime.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/types.h> #include <drm/drm_connector.h> #include <drm/drm_device.h> #include <drm/drm_edid.h> #include <drm/drm_file.h> #include "cec-priv.h" static void cec_fill_msg_report_features(struct cec_adapter *adap, struct cec_msg *msg, unsigned int la_idx); static int cec_log_addr2idx(const struct cec_adapter *adap, u8 log_addr) { int i; for (i = 0; i < adap->log_addrs.num_log_addrs; i++) if (adap->log_addrs.log_addr[i] == log_addr) return i; return -1; } static unsigned int cec_log_addr2dev(const struct cec_adapter *adap, u8 log_addr) { int i = cec_log_addr2idx(adap, log_addr); return adap->log_addrs.primary_device_type[i < 0 ? 0 : i]; } u16 cec_get_edid_phys_addr(const u8 *edid, unsigned int size, unsigned int *offset) { unsigned int loc = cec_get_edid_spa_location(edid, size); if (offset) *offset = loc; if (loc == 0) return CEC_PHYS_ADDR_INVALID; return (edid[loc] << 8) | edid[loc + 1]; } EXPORT_SYMBOL_GPL(cec_get_edid_phys_addr); void cec_fill_conn_info_from_drm(struct cec_connector_info *conn_info, const struct drm_connector *connector) { memset(conn_info, 0, sizeof(*conn_info)); conn_info->type = CEC_CONNECTOR_TYPE_DRM; conn_info->drm.card_no = connector->dev->primary->index; conn_info->drm.connector_id = connector->base.id; } EXPORT_SYMBOL_GPL(cec_fill_conn_info_from_drm); /* * Queue a new event for this filehandle. If ts == 0, then set it * to the current time. * * We keep a queue of at most max_event events where max_event differs * per event. If the queue becomes full, then drop the oldest event and * keep track of how many events we've dropped. */ void cec_queue_event_fh(struct cec_fh *fh, const struct cec_event *new_ev, u64 ts) { static const u16 max_events[CEC_NUM_EVENTS] = { 1, 1, 800, 800, 8, 8, 8, 8 }; struct cec_event_entry *entry; unsigned int ev_idx = new_ev->event - 1; if (WARN_ON(ev_idx >= ARRAY_SIZE(fh->events))) return; if (ts == 0) ts = ktime_get_ns(); mutex_lock(&fh->lock); if (ev_idx < CEC_NUM_CORE_EVENTS) entry = &fh->core_events[ev_idx]; else entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (entry) { if (new_ev->event == CEC_EVENT_LOST_MSGS && fh->queued_events[ev_idx]) { entry->ev.lost_msgs.lost_msgs += new_ev->lost_msgs.lost_msgs; goto unlock; } entry->ev = *new_ev; entry->ev.ts = ts; if (fh->queued_events[ev_idx] < max_events[ev_idx]) { /* Add new msg at the end of the queue */ list_add_tail(&entry->list, &fh->events[ev_idx]); fh->queued_events[ev_idx]++; fh->total_queued_events++; goto unlock; } if (ev_idx >= CEC_NUM_CORE_EVENTS) { list_add_tail(&entry->list, &fh->events[ev_idx]); /* drop the oldest event */ entry = list_first_entry(&fh->events[ev_idx], struct cec_event_entry, list); list_del(&entry->list); kfree(entry); } } /* Mark that events were lost */ entry = list_first_entry_or_null(&fh->events[ev_idx], struct cec_event_entry, list); if (entry) entry->ev.flags |= CEC_EVENT_FL_DROPPED_EVENTS; unlock: mutex_unlock(&fh->lock); wake_up_interruptible(&fh->wait); } /* Queue a new event for all open filehandles. 
*/ static void cec_queue_event(struct cec_adapter *adap, const struct cec_event *ev) { u64 ts = ktime_get_ns(); struct cec_fh *fh; mutex_lock(&adap->devnode.lock_fhs); list_for_each_entry(fh, &adap->devnode.fhs, list) cec_queue_event_fh(fh, ev, ts); mutex_unlock(&adap->devnode.lock_fhs); } /* Notify userspace that the CEC pin changed state at the given time. */ void cec_queue_pin_cec_event(struct cec_adapter *adap, bool is_high, bool dropped_events, ktime_t ts) { struct cec_event ev = { .event = is_high ? CEC_EVENT_PIN_CEC_HIGH : CEC_EVENT_PIN_CEC_LOW, .flags = dropped_events ? CEC_EVENT_FL_DROPPED_EVENTS : 0, }; struct cec_fh *fh; mutex_lock(&adap->devnode.lock_fhs); list_for_each_entry(fh, &adap->devnode.fhs, list) { if (fh->mode_follower == CEC_MODE_MONITOR_PIN) cec_queue_event_fh(fh, &ev, ktime_to_ns(ts)); } mutex_unlock(&adap->devnode.lock_fhs); } EXPORT_SYMBOL_GPL(cec_queue_pin_cec_event); /* Notify userspace that the HPD pin changed state at the given time. */ void cec_queue_pin_hpd_event(struct cec_adapter *adap, bool is_high, ktime_t ts) { struct cec_event ev = { .event = is_high ? CEC_EVENT_PIN_HPD_HIGH : CEC_EVENT_PIN_HPD_LOW, }; struct cec_fh *fh; mutex_lock(&adap->devnode.lock_fhs); list_for_each_entry(fh, &adap->devnode.fhs, list) cec_queue_event_fh(fh, &ev, ktime_to_ns(ts)); mutex_unlock(&adap->devnode.lock_fhs); } EXPORT_SYMBOL_GPL(cec_queue_pin_hpd_event); /* Notify userspace that the 5V pin changed state at the given time. */ void cec_queue_pin_5v_event(struct cec_adapter *adap, bool is_high, ktime_t ts) { struct cec_event ev = { .event = is_high ? CEC_EVENT_PIN_5V_HIGH : CEC_EVENT_PIN_5V_LOW, }; struct cec_fh *fh; mutex_lock(&adap->devnode.lock_fhs); list_for_each_entry(fh, &adap->devnode.fhs, list) cec_queue_event_fh(fh, &ev, ktime_to_ns(ts)); mutex_unlock(&adap->devnode.lock_fhs); } EXPORT_SYMBOL_GPL(cec_queue_pin_5v_event); /* * Queue a new message for this filehandle. * * We keep a queue of at most CEC_MAX_MSG_RX_QUEUE_SZ messages. If the * queue becomes full, then drop the oldest message and keep track * of how many messages we've dropped. */ static void cec_queue_msg_fh(struct cec_fh *fh, const struct cec_msg *msg) { static const struct cec_event ev_lost_msgs = { .event = CEC_EVENT_LOST_MSGS, .flags = 0, { .lost_msgs = { 1 }, }, }; struct cec_msg_entry *entry; mutex_lock(&fh->lock); entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (entry) { entry->msg = *msg; /* Add new msg at the end of the queue */ list_add_tail(&entry->list, &fh->msgs); if (fh->queued_msgs < CEC_MAX_MSG_RX_QUEUE_SZ) { /* All is fine if there is enough room */ fh->queued_msgs++; mutex_unlock(&fh->lock); wake_up_interruptible(&fh->wait); return; } /* * if the message queue is full, then drop the oldest one and * send a lost message event. */ entry = list_first_entry(&fh->msgs, struct cec_msg_entry, list); list_del(&entry->list); kfree(entry); } mutex_unlock(&fh->lock); /* * We lost a message, either because kmalloc failed or the queue * was full. */ cec_queue_event_fh(fh, &ev_lost_msgs, ktime_get_ns()); } /* * Queue the message for those filehandles that are in monitor mode. * If valid_la is true (this message is for us or was sent by us), * then pass it on to any monitoring filehandle. If this message * isn't for us or from us, then only give it to filehandles that * are in MONITOR_ALL mode. * * This can only happen if the CEC_CAP_MONITOR_ALL capability is * set and the CEC adapter was placed in 'monitor all' mode. 
*/ static void cec_queue_msg_monitor(struct cec_adapter *adap, const struct cec_msg *msg, bool valid_la) { struct cec_fh *fh; u32 monitor_mode = valid_la ? CEC_MODE_MONITOR : CEC_MODE_MONITOR_ALL; mutex_lock(&adap->devnode.lock_fhs); list_for_each_entry(fh, &adap->devnode.fhs, list) { if (fh->mode_follower >= monitor_mode) cec_queue_msg_fh(fh, msg); } mutex_unlock(&adap->devnode.lock_fhs); } /* * Queue the message for follower filehandles. */ static void cec_queue_msg_followers(struct cec_adapter *adap, const struct cec_msg *msg) { struct cec_fh *fh; mutex_lock(&adap->devnode.lock_fhs); list_for_each_entry(fh, &adap->devnode.fhs, list) { if (fh->mode_follower == CEC_MODE_FOLLOWER) cec_queue_msg_fh(fh, msg); } mutex_unlock(&adap->devnode.lock_fhs); } /* Notify userspace of an adapter state change. */ static void cec_post_state_event(struct cec_adapter *adap) { struct cec_event ev = { .event = CEC_EVENT_STATE_CHANGE, }; ev.state_change.phys_addr = adap->phys_addr; ev.state_change.log_addr_mask = adap->log_addrs.log_addr_mask; ev.state_change.have_conn_info = adap->conn_info.type != CEC_CONNECTOR_TYPE_NO_CONNECTOR; cec_queue_event(adap, &ev); } /* * A CEC transmit (and a possible wait for reply) completed. * If this was in blocking mode, then complete it, otherwise * queue the message for userspace to dequeue later. * * This function is called with adap->lock held. */ static void cec_data_completed(struct cec_data *data) { /* * Delete this transmit from the filehandle's xfer_list since * we're done with it. * * Note that if the filehandle is closed before this transmit * finished, then the release() function will set data->fh to NULL. * Without that we would be referring to a closed filehandle. */ if (data->fh) list_del_init(&data->xfer_list); if (data->blocking) { /* * Someone is blocking so mark the message as completed * and call complete. */ data->completed = true; complete(&data->c); } else { /* * No blocking, so just queue the message if needed and * free the memory. */ if (data->fh) cec_queue_msg_fh(data->fh, &data->msg); kfree(data); } } /* * A pending CEC transmit needs to be cancelled, either because the CEC * adapter is disabled or the transmit takes an impossibly long time to * finish, or the reply timed out. * * This function is called with adap->lock held. */ static void cec_data_cancel(struct cec_data *data, u8 tx_status, u8 rx_status) { struct cec_adapter *adap = data->adap; /* * It's either the current transmit, or it is a pending * transmit. Take the appropriate action to clear it. */ if (adap->transmitting == data) { adap->transmitting = NULL; } else { list_del_init(&data->list); if (!(data->msg.tx_status & CEC_TX_STATUS_OK)) if (!WARN_ON(!adap->transmit_queue_sz)) adap->transmit_queue_sz--; } if (data->msg.tx_status & CEC_TX_STATUS_OK) { data->msg.rx_ts = ktime_get_ns(); data->msg.rx_status = rx_status; if (!data->blocking) data->msg.tx_status = 0; } else { data->msg.tx_ts = ktime_get_ns(); data->msg.tx_status |= tx_status | CEC_TX_STATUS_MAX_RETRIES; data->msg.tx_error_cnt++; data->attempts = 0; if (!data->blocking) data->msg.rx_status = 0; } /* Queue transmitted message for monitoring purposes */ cec_queue_msg_monitor(adap, &data->msg, 1); if (!data->blocking && data->msg.sequence) /* Allow drivers to react to a canceled transmit */ call_void_op(adap, adap_nb_transmit_canceled, &data->msg); cec_data_completed(data); } /* * Flush all pending transmits and cancel any pending timeout work. * * This function is called with adap->lock held. 
*/ static void cec_flush(struct cec_adapter *adap) { struct cec_data *data, *n; /* * If the adapter is disabled, or we're asked to stop, * then cancel any pending transmits. */ while (!list_empty(&adap->transmit_queue)) { data = list_first_entry(&adap->transmit_queue, struct cec_data, list); cec_data_cancel(data, CEC_TX_STATUS_ABORTED, 0); } if (adap->transmitting) adap->transmit_in_progress_aborted = true; /* Cancel the pending timeout work. */ list_for_each_entry_safe(data, n, &adap->wait_queue, list) { if (cancel_delayed_work(&data->work)) cec_data_cancel(data, CEC_TX_STATUS_OK, CEC_RX_STATUS_ABORTED); /* * If cancel_delayed_work returned false, then * the cec_wait_timeout function is running, * which will call cec_data_completed. So no * need to do anything special in that case. */ } /* * If something went wrong and this counter isn't what it should * be, then this will reset it back to 0. Warn if it is not 0, * since it indicates a bug, either in this framework or in a * CEC driver. */ if (WARN_ON(adap->transmit_queue_sz)) adap->transmit_queue_sz = 0; } /* * Main CEC state machine * * Wait until the thread should be stopped, or we are not transmitting and * a new transmit message is queued up, in which case we start transmitting * that message. When the adapter finished transmitting the message it will * call cec_transmit_done(). * * If the adapter is disabled, then remove all queued messages instead. * * If the current transmit times out, then cancel that transmit. */ int cec_thread_func(void *_adap) { struct cec_adapter *adap = _adap; for (;;) { unsigned int signal_free_time; struct cec_data *data; bool timeout = false; u8 attempts; if (adap->transmit_in_progress) { int err; /* * We are transmitting a message, so add a timeout * to prevent the state machine to get stuck waiting * for this message to finalize and add a check to * see if the adapter is disabled in which case the * transmit should be canceled. */ err = wait_event_interruptible_timeout(adap->kthread_waitq, (adap->needs_hpd && (!adap->is_configured && !adap->is_configuring)) || kthread_should_stop() || (!adap->transmit_in_progress && !list_empty(&adap->transmit_queue)), msecs_to_jiffies(adap->xfer_timeout_ms)); timeout = err == 0; } else { /* Otherwise we just wait for something to happen. */ wait_event_interruptible(adap->kthread_waitq, kthread_should_stop() || (!adap->transmit_in_progress && !list_empty(&adap->transmit_queue))); } mutex_lock(&adap->lock); if ((adap->needs_hpd && (!adap->is_configured && !adap->is_configuring)) || kthread_should_stop()) { cec_flush(adap); goto unlock; } if (adap->transmit_in_progress && timeout) { /* * If we timeout, then log that. Normally this does * not happen and it is an indication of a faulty CEC * adapter driver, or the CEC bus is in some weird * state. On rare occasions it can happen if there is * so much traffic on the bus that the adapter was * unable to transmit for xfer_timeout_ms (2.1s by * default). */ if (adap->transmitting) { pr_warn("cec-%s: message %*ph timed out\n", adap->name, adap->transmitting->msg.len, adap->transmitting->msg.msg); /* Just give up on this. */ cec_data_cancel(adap->transmitting, CEC_TX_STATUS_TIMEOUT, 0); } else { pr_warn("cec-%s: transmit timed out\n", adap->name); } adap->transmit_in_progress = false; adap->tx_timeout_cnt++; goto unlock; } /* * If we are still transmitting, or there is nothing new to * transmit, then just continue waiting. 
*/ if (adap->transmit_in_progress || list_empty(&adap->transmit_queue)) goto unlock; /* Get a new message to transmit */ data = list_first_entry(&adap->transmit_queue, struct cec_data, list); list_del_init(&data->list); if (!WARN_ON(!data->adap->transmit_queue_sz)) adap->transmit_queue_sz--; /* Make this the current transmitting message */ adap->transmitting = data; /* * Suggested number of attempts as per the CEC 2.0 spec: * 4 attempts is the default, except for 'secondary poll * messages', i.e. poll messages not sent during the adapter * configuration phase when it allocates logical addresses. */ if (data->msg.len == 1 && adap->is_configured) attempts = 2; else attempts = 4; /* Set the suggested signal free time */ if (data->attempts) { /* should be >= 3 data bit periods for a retry */ signal_free_time = CEC_SIGNAL_FREE_TIME_RETRY; } else if (adap->last_initiator != cec_msg_initiator(&data->msg)) { /* should be >= 5 data bit periods for new initiator */ signal_free_time = CEC_SIGNAL_FREE_TIME_NEW_INITIATOR; adap->last_initiator = cec_msg_initiator(&data->msg); } else { /* * should be >= 7 data bit periods for sending another * frame immediately after another. */ signal_free_time = CEC_SIGNAL_FREE_TIME_NEXT_XFER; } if (data->attempts == 0) data->attempts = attempts; adap->transmit_in_progress_aborted = false; /* Tell the adapter to transmit, cancel on error */ if (call_op(adap, adap_transmit, data->attempts, signal_free_time, &data->msg)) cec_data_cancel(data, CEC_TX_STATUS_ABORTED, 0); else adap->transmit_in_progress = true; unlock: mutex_unlock(&adap->lock); if (kthread_should_stop()) break; } return 0; } /* * Called by the CEC adapter if a transmit finished. */ void cec_transmit_done_ts(struct cec_adapter *adap, u8 status, u8 arb_lost_cnt, u8 nack_cnt, u8 low_drive_cnt, u8 error_cnt, ktime_t ts) { struct cec_data *data; struct cec_msg *msg; unsigned int attempts_made = arb_lost_cnt + nack_cnt + low_drive_cnt + error_cnt; bool done = status & (CEC_TX_STATUS_MAX_RETRIES | CEC_TX_STATUS_OK); bool aborted = adap->transmit_in_progress_aborted; dprintk(2, "%s: status 0x%02x\n", __func__, status); if (attempts_made < 1) attempts_made = 1; mutex_lock(&adap->lock); data = adap->transmitting; if (!data) { /* * This might happen if a transmit was issued and the cable is * unplugged while the transmit is ongoing. Ignore this * transmit in that case. */ if (!adap->transmit_in_progress) dprintk(1, "%s was called without an ongoing transmit!\n", __func__); adap->transmit_in_progress = false; goto wake_thread; } adap->transmit_in_progress = false; adap->transmit_in_progress_aborted = false; msg = &data->msg; /* Drivers must fill in the status! */ WARN_ON(status == 0); msg->tx_ts = ktime_to_ns(ts); msg->tx_status |= status; msg->tx_arb_lost_cnt += arb_lost_cnt; msg->tx_nack_cnt += nack_cnt; msg->tx_low_drive_cnt += low_drive_cnt; msg->tx_error_cnt += error_cnt; adap->tx_arb_lost_cnt += arb_lost_cnt; adap->tx_low_drive_cnt += low_drive_cnt; adap->tx_error_cnt += error_cnt; /* * Low Drive transmission errors should really not happen for * well-behaved CEC devices and proper HDMI cables. * * Ditto for the 'Error' status. * * For the first few times that this happens, log this. * Stop logging after that, since that will not add any more * useful information and instead it will just flood the kernel log. 
*/ if (done && adap->tx_low_drive_log_cnt < 8 && msg->tx_low_drive_cnt) { adap->tx_low_drive_log_cnt++; dprintk(0, "low drive counter: %u (seq %u: %*ph)\n", msg->tx_low_drive_cnt, msg->sequence, msg->len, msg->msg); } if (done && adap->tx_error_log_cnt < 8 && msg->tx_error_cnt) { adap->tx_error_log_cnt++; dprintk(0, "error counter: %u (seq %u: %*ph)\n", msg->tx_error_cnt, msg->sequence, msg->len, msg->msg); } /* Mark that we're done with this transmit */ adap->transmitting = NULL; /* * If there are still retry attempts left and there was an error and * the hardware didn't signal that it retried itself (by setting * CEC_TX_STATUS_MAX_RETRIES), then we will retry ourselves. */ if (!aborted && data->attempts > attempts_made && !done) { /* Retry this message */ data->attempts -= attempts_made; if (msg->timeout) dprintk(2, "retransmit: %*ph (attempts: %d, wait for 0x%02x)\n", msg->len, msg->msg, data->attempts, msg->reply); else dprintk(2, "retransmit: %*ph (attempts: %d)\n", msg->len, msg->msg, data->attempts); /* Add the message in front of the transmit queue */ list_add(&data->list, &adap->transmit_queue); adap->transmit_queue_sz++; goto wake_thread; } if (aborted && !done) status |= CEC_TX_STATUS_ABORTED; data->attempts = 0; /* Always set CEC_TX_STATUS_MAX_RETRIES on error */ if (!(status & CEC_TX_STATUS_OK)) msg->tx_status |= CEC_TX_STATUS_MAX_RETRIES; /* Queue transmitted message for monitoring purposes */ cec_queue_msg_monitor(adap, msg, 1); if ((status & CEC_TX_STATUS_OK) && adap->is_configured && msg->timeout) { /* * Queue the message into the wait queue if we want to wait * for a reply. */ list_add_tail(&data->list, &adap->wait_queue); schedule_delayed_work(&data->work, msecs_to_jiffies(msg->timeout)); } else { /* Otherwise we're done */ cec_data_completed(data); } wake_thread: /* * Wake up the main thread to see if another message is ready * for transmitting or to retry the current message. */ wake_up_interruptible(&adap->kthread_waitq); mutex_unlock(&adap->lock); } EXPORT_SYMBOL_GPL(cec_transmit_done_ts); void cec_transmit_attempt_done_ts(struct cec_adapter *adap, u8 status, ktime_t ts) { switch (status & ~CEC_TX_STATUS_MAX_RETRIES) { case CEC_TX_STATUS_OK: cec_transmit_done_ts(adap, status, 0, 0, 0, 0, ts); return; case CEC_TX_STATUS_ARB_LOST: cec_transmit_done_ts(adap, status, 1, 0, 0, 0, ts); return; case CEC_TX_STATUS_NACK: cec_transmit_done_ts(adap, status, 0, 1, 0, 0, ts); return; case CEC_TX_STATUS_LOW_DRIVE: cec_transmit_done_ts(adap, status, 0, 0, 1, 0, ts); return; case CEC_TX_STATUS_ERROR: cec_transmit_done_ts(adap, status, 0, 0, 0, 1, ts); return; default: /* Should never happen */ WARN(1, "cec-%s: invalid status 0x%02x\n", adap->name, status); return; } } EXPORT_SYMBOL_GPL(cec_transmit_attempt_done_ts); /* * Called when waiting for a reply times out. */ static void cec_wait_timeout(struct work_struct *work) { struct cec_data *data = container_of(work, struct cec_data, work.work); struct cec_adapter *adap = data->adap; mutex_lock(&adap->lock); /* * Sanity check in case the timeout and the arrival of the message * happened at the same time. */ if (list_empty(&data->list)) goto unlock; /* Mark the message as timed out */ list_del_init(&data->list); cec_data_cancel(data, CEC_TX_STATUS_OK, CEC_RX_STATUS_TIMEOUT); unlock: mutex_unlock(&adap->lock); } /* * Transmit a message. The fh argument may be NULL if the transmit is not * associated with a specific filehandle. * * This function is called with adap->lock held. 
*/ int cec_transmit_msg_fh(struct cec_adapter *adap, struct cec_msg *msg, struct cec_fh *fh, bool block) { struct cec_data *data; bool is_raw = msg_is_raw(msg); if (adap->devnode.unregistered) return -ENODEV; msg->rx_ts = 0; msg->tx_ts = 0; msg->rx_status = 0; msg->tx_status = 0; msg->tx_arb_lost_cnt = 0; msg->tx_nack_cnt = 0; msg->tx_low_drive_cnt = 0; msg->tx_error_cnt = 0; msg->sequence = 0; if (msg->reply && msg->timeout == 0) { /* Make sure the timeout isn't 0. */ msg->timeout = 1000; } msg->flags &= CEC_MSG_FL_REPLY_TO_FOLLOWERS | CEC_MSG_FL_RAW; if (!msg->timeout) msg->flags &= ~CEC_MSG_FL_REPLY_TO_FOLLOWERS; /* Sanity checks */ if (msg->len == 0 || msg->len > CEC_MAX_MSG_SIZE) { dprintk(1, "%s: invalid length %d\n", __func__, msg->len); return -EINVAL; } memset(msg->msg + msg->len, 0, sizeof(msg->msg) - msg->len); if (msg->timeout) dprintk(2, "%s: %*ph (wait for 0x%02x%s)\n", __func__, msg->len, msg->msg, msg->reply, !block ? ", nb" : ""); else dprintk(2, "%s: %*ph%s\n", __func__, msg->len, msg->msg, !block ? " (nb)" : ""); if (msg->timeout && msg->len == 1) { dprintk(1, "%s: can't reply to poll msg\n", __func__); return -EINVAL; } if (is_raw) { if (!capable(CAP_SYS_RAWIO)) return -EPERM; } else { /* A CDC-Only device can only send CDC messages */ if ((adap->log_addrs.flags & CEC_LOG_ADDRS_FL_CDC_ONLY) && (msg->len == 1 || msg->msg[1] != CEC_MSG_CDC_MESSAGE)) { dprintk(1, "%s: not a CDC message\n", __func__); return -EINVAL; } if (msg->len >= 4 && msg->msg[1] == CEC_MSG_CDC_MESSAGE) { msg->msg[2] = adap->phys_addr >> 8; msg->msg[3] = adap->phys_addr & 0xff; } if (msg->len == 1) { if (cec_msg_destination(msg) == 0xf) { dprintk(1, "%s: invalid poll message\n", __func__); return -EINVAL; } if (cec_has_log_addr(adap, cec_msg_destination(msg))) { /* * If the destination is a logical address our * adapter has already claimed, then just NACK * this. It depends on the hardware what it will * do with a POLL to itself (some OK this), so * it is just as easy to handle it here so the * behavior will be consistent. */ msg->tx_ts = ktime_get_ns(); msg->tx_status = CEC_TX_STATUS_NACK | CEC_TX_STATUS_MAX_RETRIES; msg->tx_nack_cnt = 1; msg->sequence = ++adap->sequence; if (!msg->sequence) msg->sequence = ++adap->sequence; return 0; } } if (msg->len > 1 && !cec_msg_is_broadcast(msg) && cec_has_log_addr(adap, cec_msg_destination(msg))) { dprintk(1, "%s: destination is the adapter itself\n", __func__); return -EINVAL; } if (msg->len > 1 && adap->is_configured && !cec_has_log_addr(adap, cec_msg_initiator(msg))) { dprintk(1, "%s: initiator has unknown logical address %d\n", __func__, cec_msg_initiator(msg)); return -EINVAL; } /* * Special case: allow Ping and IMAGE/TEXT_VIEW_ON to be * transmitted to a TV, even if the adapter is unconfigured. * This makes it possible to detect or wake up displays that * pull down the HPD when in standby. 
*/ if (!adap->is_configured && !adap->is_configuring && (msg->len > 2 || cec_msg_destination(msg) != CEC_LOG_ADDR_TV || (msg->len == 2 && msg->msg[1] != CEC_MSG_IMAGE_VIEW_ON && msg->msg[1] != CEC_MSG_TEXT_VIEW_ON))) { dprintk(1, "%s: adapter is unconfigured\n", __func__); return -ENONET; } } if (!adap->is_configured && !adap->is_configuring) { if (adap->needs_hpd) { dprintk(1, "%s: adapter is unconfigured and needs HPD\n", __func__); return -ENONET; } if (msg->reply) { dprintk(1, "%s: invalid msg->reply\n", __func__); return -EINVAL; } } if (adap->transmit_queue_sz >= CEC_MAX_MSG_TX_QUEUE_SZ) { dprintk(2, "%s: transmit queue full\n", __func__); return -EBUSY; } data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; msg->sequence = ++adap->sequence; if (!msg->sequence) msg->sequence = ++adap->sequence; data->msg = *msg; data->fh = fh; data->adap = adap; data->blocking = block; init_completion(&data->c); INIT_DELAYED_WORK(&data->work, cec_wait_timeout); if (fh) list_add_tail(&data->xfer_list, &fh->xfer_list); else INIT_LIST_HEAD(&data->xfer_list); list_add_tail(&data->list, &adap->transmit_queue); adap->transmit_queue_sz++; if (!adap->transmitting) wake_up_interruptible(&adap->kthread_waitq); /* All done if we don't need to block waiting for completion */ if (!block) return 0; /* * Release the lock and wait, retake the lock afterwards. */ mutex_unlock(&adap->lock); wait_for_completion_killable(&data->c); if (!data->completed) cancel_delayed_work_sync(&data->work); mutex_lock(&adap->lock); /* Cancel the transmit if it was interrupted */ if (!data->completed) { if (data->msg.tx_status & CEC_TX_STATUS_OK) cec_data_cancel(data, CEC_TX_STATUS_OK, CEC_RX_STATUS_ABORTED); else cec_data_cancel(data, CEC_TX_STATUS_ABORTED, 0); } /* The transmit completed (possibly with an error) */ *msg = data->msg; if (WARN_ON(!list_empty(&data->list))) list_del(&data->list); if (WARN_ON(!list_empty(&data->xfer_list))) list_del(&data->xfer_list); kfree(data); return 0; } /* Helper function to be used by drivers and this framework. */ int cec_transmit_msg(struct cec_adapter *adap, struct cec_msg *msg, bool block) { int ret; mutex_lock(&adap->lock); ret = cec_transmit_msg_fh(adap, msg, NULL, block); mutex_unlock(&adap->lock); return ret; } EXPORT_SYMBOL_GPL(cec_transmit_msg); /* * I don't like forward references but without this the low-level * cec_received_msg() function would come after a bunch of high-level * CEC protocol handling functions. That was very confusing. */ static int cec_receive_notify(struct cec_adapter *adap, struct cec_msg *msg, bool is_reply); #define DIRECTED 0x80 #define BCAST1_4 0x40 #define BCAST2_0 0x20 /* broadcast only allowed for >= 2.0 */ #define BCAST (BCAST1_4 | BCAST2_0) #define BOTH (BCAST | DIRECTED) /* * Specify minimum length and whether the message is directed, broadcast * or both. Messages that do not match the criteria are ignored as per * the CEC specification. 
*/ static const u8 cec_msg_size[256] = { [CEC_MSG_ACTIVE_SOURCE] = 4 | BCAST, [CEC_MSG_IMAGE_VIEW_ON] = 2 | DIRECTED, [CEC_MSG_TEXT_VIEW_ON] = 2 | DIRECTED, [CEC_MSG_INACTIVE_SOURCE] = 4 | DIRECTED, [CEC_MSG_REQUEST_ACTIVE_SOURCE] = 2 | BCAST, [CEC_MSG_ROUTING_CHANGE] = 6 | BCAST, [CEC_MSG_ROUTING_INFORMATION] = 4 | BCAST, [CEC_MSG_SET_STREAM_PATH] = 4 | BCAST, [CEC_MSG_STANDBY] = 2 | BOTH, [CEC_MSG_RECORD_OFF] = 2 | DIRECTED, [CEC_MSG_RECORD_ON] = 3 | DIRECTED, [CEC_MSG_RECORD_STATUS] = 3 | DIRECTED, [CEC_MSG_RECORD_TV_SCREEN] = 2 | DIRECTED, [CEC_MSG_CLEAR_ANALOGUE_TIMER] = 13 | DIRECTED, [CEC_MSG_CLEAR_DIGITAL_TIMER] = 16 | DIRECTED, [CEC_MSG_CLEAR_EXT_TIMER] = 13 | DIRECTED, [CEC_MSG_SET_ANALOGUE_TIMER] = 13 | DIRECTED, [CEC_MSG_SET_DIGITAL_TIMER] = 16 | DIRECTED, [CEC_MSG_SET_EXT_TIMER] = 13 | DIRECTED, [CEC_MSG_SET_TIMER_PROGRAM_TITLE] = 2 | DIRECTED, [CEC_MSG_TIMER_CLEARED_STATUS] = 3 | DIRECTED, [CEC_MSG_TIMER_STATUS] = 3 | DIRECTED, [CEC_MSG_CEC_VERSION] = 3 | DIRECTED, [CEC_MSG_GET_CEC_VERSION] = 2 | DIRECTED, [CEC_MSG_GIVE_PHYSICAL_ADDR] = 2 | DIRECTED, [CEC_MSG_GET_MENU_LANGUAGE] = 2 | DIRECTED, [CEC_MSG_REPORT_PHYSICAL_ADDR] = 5 | BCAST, [CEC_MSG_SET_MENU_LANGUAGE] = 5 | BCAST, [CEC_MSG_REPORT_FEATURES] = 6 | BCAST, [CEC_MSG_GIVE_FEATURES] = 2 | DIRECTED, [CEC_MSG_DECK_CONTROL] = 3 | DIRECTED, [CEC_MSG_DECK_STATUS] = 3 | DIRECTED, [CEC_MSG_GIVE_DECK_STATUS] = 3 | DIRECTED, [CEC_MSG_PLAY] = 3 | DIRECTED, [CEC_MSG_GIVE_TUNER_DEVICE_STATUS] = 3 | DIRECTED, [CEC_MSG_SELECT_ANALOGUE_SERVICE] = 6 | DIRECTED, [CEC_MSG_SELECT_DIGITAL_SERVICE] = 9 | DIRECTED, [CEC_MSG_TUNER_DEVICE_STATUS] = 7 | DIRECTED, [CEC_MSG_TUNER_STEP_DECREMENT] = 2 | DIRECTED, [CEC_MSG_TUNER_STEP_INCREMENT] = 2 | DIRECTED, [CEC_MSG_DEVICE_VENDOR_ID] = 5 | BCAST, [CEC_MSG_GIVE_DEVICE_VENDOR_ID] = 2 | DIRECTED, [CEC_MSG_VENDOR_COMMAND] = 2 | DIRECTED, [CEC_MSG_VENDOR_COMMAND_WITH_ID] = 5 | BOTH, [CEC_MSG_VENDOR_REMOTE_BUTTON_DOWN] = 2 | BOTH, [CEC_MSG_VENDOR_REMOTE_BUTTON_UP] = 2 | BOTH, [CEC_MSG_SET_OSD_STRING] = 3 | DIRECTED, [CEC_MSG_GIVE_OSD_NAME] = 2 | DIRECTED, [CEC_MSG_SET_OSD_NAME] = 2 | DIRECTED, [CEC_MSG_MENU_REQUEST] = 3 | DIRECTED, [CEC_MSG_MENU_STATUS] = 3 | DIRECTED, [CEC_MSG_USER_CONTROL_PRESSED] = 3 | DIRECTED, [CEC_MSG_USER_CONTROL_RELEASED] = 2 | DIRECTED, [CEC_MSG_GIVE_DEVICE_POWER_STATUS] = 2 | DIRECTED, [CEC_MSG_REPORT_POWER_STATUS] = 3 | DIRECTED | BCAST2_0, [CEC_MSG_FEATURE_ABORT] = 4 | DIRECTED, [CEC_MSG_ABORT] = 2 | DIRECTED, [CEC_MSG_GIVE_AUDIO_STATUS] = 2 | DIRECTED, [CEC_MSG_GIVE_SYSTEM_AUDIO_MODE_STATUS] = 2 | DIRECTED, [CEC_MSG_REPORT_AUDIO_STATUS] = 3 | DIRECTED, [CEC_MSG_REPORT_SHORT_AUDIO_DESCRIPTOR] = 2 | DIRECTED, [CEC_MSG_REQUEST_SHORT_AUDIO_DESCRIPTOR] = 2 | DIRECTED, [CEC_MSG_SET_SYSTEM_AUDIO_MODE] = 3 | BOTH, [CEC_MSG_SET_AUDIO_VOLUME_LEVEL] = 3 | DIRECTED, [CEC_MSG_SYSTEM_AUDIO_MODE_REQUEST] = 2 | DIRECTED, [CEC_MSG_SYSTEM_AUDIO_MODE_STATUS] = 3 | DIRECTED, [CEC_MSG_SET_AUDIO_RATE] = 3 | DIRECTED, [CEC_MSG_INITIATE_ARC] = 2 | DIRECTED, [CEC_MSG_REPORT_ARC_INITIATED] = 2 | DIRECTED, [CEC_MSG_REPORT_ARC_TERMINATED] = 2 | DIRECTED, [CEC_MSG_REQUEST_ARC_INITIATION] = 2 | DIRECTED, [CEC_MSG_REQUEST_ARC_TERMINATION] = 2 | DIRECTED, [CEC_MSG_TERMINATE_ARC] = 2 | DIRECTED, [CEC_MSG_REQUEST_CURRENT_LATENCY] = 4 | BCAST, [CEC_MSG_REPORT_CURRENT_LATENCY] = 6 | BCAST, [CEC_MSG_CDC_MESSAGE] = 2 | BCAST, }; /* Called by the CEC adapter if a message is received */ void cec_received_msg_ts(struct cec_adapter *adap, struct cec_msg *msg, ktime_t ts) { struct cec_data *data; u8 msg_init = 
cec_msg_initiator(msg); u8 msg_dest = cec_msg_destination(msg); u8 cmd = msg->msg[1]; bool is_reply = false; bool valid_la = true; bool monitor_valid_la = true; u8 min_len = 0; if (WARN_ON(!msg->len || msg->len > CEC_MAX_MSG_SIZE)) return; if (adap->devnode.unregistered) return; /* * Some CEC adapters will receive the messages that they transmitted. * This test filters out those messages by checking if we are the * initiator, and just returning in that case. * * Note that this won't work if this is an Unregistered device. * * It is bad practice if the hardware receives the message that it * transmitted and luckily most CEC adapters behave correctly in this * respect. */ if (msg_init != CEC_LOG_ADDR_UNREGISTERED && cec_has_log_addr(adap, msg_init)) return; msg->rx_ts = ktime_to_ns(ts); msg->rx_status = CEC_RX_STATUS_OK; msg->sequence = msg->reply = msg->timeout = 0; msg->tx_status = 0; msg->tx_ts = 0; msg->tx_arb_lost_cnt = 0; msg->tx_nack_cnt = 0; msg->tx_low_drive_cnt = 0; msg->tx_error_cnt = 0; msg->flags = 0; memset(msg->msg + msg->len, 0, sizeof(msg->msg) - msg->len); mutex_lock(&adap->lock); dprintk(2, "%s: %*ph\n", __func__, msg->len, msg->msg); if (!adap->transmit_in_progress) adap->last_initiator = 0xff; /* Check if this message was for us (directed or broadcast). */ if (!cec_msg_is_broadcast(msg)) { valid_la = cec_has_log_addr(adap, msg_dest); monitor_valid_la = valid_la; } /* * Check if the length is not too short or if the message is a * broadcast message where a directed message was expected or * vice versa. If so, then the message has to be ignored (according * to section CEC 7.3 and CEC 12.2). */ if (valid_la && msg->len > 1 && cec_msg_size[cmd]) { u8 dir_fl = cec_msg_size[cmd] & BOTH; min_len = cec_msg_size[cmd] & 0x1f; if (msg->len < min_len) valid_la = false; else if (!cec_msg_is_broadcast(msg) && !(dir_fl & DIRECTED)) valid_la = false; else if (cec_msg_is_broadcast(msg) && !(dir_fl & BCAST)) valid_la = false; else if (cec_msg_is_broadcast(msg) && adap->log_addrs.cec_version < CEC_OP_CEC_VERSION_2_0 && !(dir_fl & BCAST1_4)) valid_la = false; } if (valid_la && min_len) { /* These messages have special length requirements */ switch (cmd) { case CEC_MSG_TIMER_STATUS: if (msg->msg[2] & 0x10) { switch (msg->msg[2] & 0xf) { case CEC_OP_PROG_INFO_NOT_ENOUGH_SPACE: case CEC_OP_PROG_INFO_MIGHT_NOT_BE_ENOUGH_SPACE: if (msg->len < 5) valid_la = false; break; } } else if ((msg->msg[2] & 0xf) == CEC_OP_PROG_ERROR_DUPLICATE) { if (msg->len < 5) valid_la = false; } break; case CEC_MSG_RECORD_ON: switch (msg->msg[2]) { case CEC_OP_RECORD_SRC_OWN: break; case CEC_OP_RECORD_SRC_DIGITAL: if (msg->len < 10) valid_la = false; break; case CEC_OP_RECORD_SRC_ANALOG: if (msg->len < 7) valid_la = false; break; case CEC_OP_RECORD_SRC_EXT_PLUG: if (msg->len < 4) valid_la = false; break; case CEC_OP_RECORD_SRC_EXT_PHYS_ADDR: if (msg->len < 5) valid_la = false; break; } break; } } /* It's a valid message and not a poll or CDC message */ if (valid_la && msg->len > 1 && cmd != CEC_MSG_CDC_MESSAGE) { bool abort = cmd == CEC_MSG_FEATURE_ABORT; /* The aborted command is in msg[2] */ if (abort) cmd = msg->msg[2]; /* * Walk over all transmitted messages that are waiting for a * reply. */ list_for_each_entry(data, &adap->wait_queue, list) { struct cec_msg *dst = &data->msg; /* * The *only* CEC message that has two possible replies * is CEC_MSG_INITIATE_ARC. * In this case allow either of the two replies. 
*/ if (!abort && dst->msg[1] == CEC_MSG_INITIATE_ARC && (cmd == CEC_MSG_REPORT_ARC_INITIATED || cmd == CEC_MSG_REPORT_ARC_TERMINATED) && (dst->reply == CEC_MSG_REPORT_ARC_INITIATED || dst->reply == CEC_MSG_REPORT_ARC_TERMINATED)) dst->reply = cmd; /* Does the command match? */ if ((abort && cmd != dst->msg[1]) || (!abort && cmd != dst->reply)) continue; /* Does the addressing match? */ if (msg_init != cec_msg_destination(dst) && !cec_msg_is_broadcast(dst)) continue; /* We got a reply */ memcpy(dst->msg, msg->msg, msg->len); dst->len = msg->len; dst->rx_ts = msg->rx_ts; dst->rx_status = msg->rx_status; if (abort) dst->rx_status |= CEC_RX_STATUS_FEATURE_ABORT; msg->flags = dst->flags; msg->sequence = dst->sequence; /* Remove it from the wait_queue */ list_del_init(&data->list); /* Cancel the pending timeout work */ if (!cancel_delayed_work(&data->work)) { mutex_unlock(&adap->lock); cancel_delayed_work_sync(&data->work); mutex_lock(&adap->lock); } /* * Mark this as a reply, provided someone is still * waiting for the answer. */ if (data->fh) is_reply = true; cec_data_completed(data); break; } } mutex_unlock(&adap->lock); /* Pass the message on to any monitoring filehandles */ cec_queue_msg_monitor(adap, msg, monitor_valid_la); /* We're done if it is not for us or a poll message */ if (!valid_la || msg->len <= 1) return; if (adap->log_addrs.log_addr_mask == 0) return; /* * Process the message on the protocol level. If is_reply is true, * then cec_receive_notify() won't pass on the reply to the listener(s) * since that was already done by cec_data_completed() above. */ cec_receive_notify(adap, msg, is_reply); } EXPORT_SYMBOL_GPL(cec_received_msg_ts); /* Logical Address Handling */ /* * Attempt to claim a specific logical address. * * This function is called with adap->lock held. */ static int cec_config_log_addr(struct cec_adapter *adap, unsigned int idx, unsigned int log_addr) { struct cec_log_addrs *las = &adap->log_addrs; struct cec_msg msg = { }; const unsigned int max_retries = 2; unsigned int i; int err; if (cec_has_log_addr(adap, log_addr)) return 0; /* Send poll message */ msg.len = 1; msg.msg[0] = (log_addr << 4) | log_addr; for (i = 0; i < max_retries; i++) { err = cec_transmit_msg_fh(adap, &msg, NULL, true); /* * While trying to poll the physical address was reset * and the adapter was unconfigured, so bail out. */ if (adap->phys_addr == CEC_PHYS_ADDR_INVALID) return -EINTR; /* Also bail out if the PA changed while configuring. */ if (adap->must_reconfigure) return -EINTR; if (err) return err; /* * The message was aborted or timed out due to a disconnect or * unconfigure, just bail out. */ if (msg.tx_status & (CEC_TX_STATUS_ABORTED | CEC_TX_STATUS_TIMEOUT)) return -EINTR; if (msg.tx_status & CEC_TX_STATUS_OK) return 0; if (msg.tx_status & CEC_TX_STATUS_NACK) break; /* * Retry up to max_retries times if the message was neither * OKed or NACKed. This can happen due to e.g. a Lost * Arbitration condition. */ } /* * If we are unable to get an OK or a NACK after max_retries attempts * (and note that each attempt already consists of four polls), then * we assume that something is really weird and that it is not a * good idea to try and claim this logical address. */ if (i == max_retries) { dprintk(0, "polling for LA %u failed with tx_status=0x%04x\n", log_addr, msg.tx_status); return 0; } /* * Message not acknowledged, so this logical * address is free to use. 
*/ err = call_op(adap, adap_log_addr, log_addr); if (err) return err; las->log_addr[idx] = log_addr; las->log_addr_mask |= 1 << log_addr; return 1; } /* * Unconfigure the adapter: clear all logical addresses and send * the state changed event. * * This function is called with adap->lock held. */ static void cec_adap_unconfigure(struct cec_adapter *adap) { if (!adap->needs_hpd || adap->phys_addr != CEC_PHYS_ADDR_INVALID) WARN_ON(call_op(adap, adap_log_addr, CEC_LOG_ADDR_INVALID)); adap->log_addrs.log_addr_mask = 0; adap->is_configured = false; cec_flush(adap); wake_up_interruptible(&adap->kthread_waitq); cec_post_state_event(adap); call_void_op(adap, adap_unconfigured); } /* * Attempt to claim the required logical addresses. */ static int cec_config_thread_func(void *arg) { /* The various LAs for each type of device */ static const u8 tv_log_addrs[] = { CEC_LOG_ADDR_TV, CEC_LOG_ADDR_SPECIFIC, CEC_LOG_ADDR_INVALID }; static const u8 record_log_addrs[] = { CEC_LOG_ADDR_RECORD_1, CEC_LOG_ADDR_RECORD_2, CEC_LOG_ADDR_RECORD_3, CEC_LOG_ADDR_BACKUP_1, CEC_LOG_ADDR_BACKUP_2, CEC_LOG_ADDR_INVALID }; static const u8 tuner_log_addrs[] = { CEC_LOG_ADDR_TUNER_1, CEC_LOG_ADDR_TUNER_2, CEC_LOG_ADDR_TUNER_3, CEC_LOG_ADDR_TUNER_4, CEC_LOG_ADDR_BACKUP_1, CEC_LOG_ADDR_BACKUP_2, CEC_LOG_ADDR_INVALID }; static const u8 playback_log_addrs[] = { CEC_LOG_ADDR_PLAYBACK_1, CEC_LOG_ADDR_PLAYBACK_2, CEC_LOG_ADDR_PLAYBACK_3, CEC_LOG_ADDR_BACKUP_1, CEC_LOG_ADDR_BACKUP_2, CEC_LOG_ADDR_INVALID }; static const u8 audiosystem_log_addrs[] = { CEC_LOG_ADDR_AUDIOSYSTEM, CEC_LOG_ADDR_INVALID }; static const u8 specific_use_log_addrs[] = { CEC_LOG_ADDR_SPECIFIC, CEC_LOG_ADDR_BACKUP_1, CEC_LOG_ADDR_BACKUP_2, CEC_LOG_ADDR_INVALID }; static const u8 *type2addrs[6] = { [CEC_LOG_ADDR_TYPE_TV] = tv_log_addrs, [CEC_LOG_ADDR_TYPE_RECORD] = record_log_addrs, [CEC_LOG_ADDR_TYPE_TUNER] = tuner_log_addrs, [CEC_LOG_ADDR_TYPE_PLAYBACK] = playback_log_addrs, [CEC_LOG_ADDR_TYPE_AUDIOSYSTEM] = audiosystem_log_addrs, [CEC_LOG_ADDR_TYPE_SPECIFIC] = specific_use_log_addrs, }; static const u16 type2mask[] = { [CEC_LOG_ADDR_TYPE_TV] = CEC_LOG_ADDR_MASK_TV, [CEC_LOG_ADDR_TYPE_RECORD] = CEC_LOG_ADDR_MASK_RECORD, [CEC_LOG_ADDR_TYPE_TUNER] = CEC_LOG_ADDR_MASK_TUNER, [CEC_LOG_ADDR_TYPE_PLAYBACK] = CEC_LOG_ADDR_MASK_PLAYBACK, [CEC_LOG_ADDR_TYPE_AUDIOSYSTEM] = CEC_LOG_ADDR_MASK_AUDIOSYSTEM, [CEC_LOG_ADDR_TYPE_SPECIFIC] = CEC_LOG_ADDR_MASK_SPECIFIC, }; struct cec_adapter *adap = arg; struct cec_log_addrs *las = &adap->log_addrs; int err; int i, j; mutex_lock(&adap->lock); dprintk(1, "physical address: %x.%x.%x.%x, claim %d logical addresses\n", cec_phys_addr_exp(adap->phys_addr), las->num_log_addrs); las->log_addr_mask = 0; if (las->log_addr_type[0] == CEC_LOG_ADDR_TYPE_UNREGISTERED) goto configured; reconfigure: for (i = 0; i < las->num_log_addrs; i++) { unsigned int type = las->log_addr_type[i]; const u8 *la_list; u8 last_la; /* * The TV functionality can only map to physical address 0. * For any other address, try the Specific functionality * instead as per the spec. 
*/ if (adap->phys_addr && type == CEC_LOG_ADDR_TYPE_TV) type = CEC_LOG_ADDR_TYPE_SPECIFIC; la_list = type2addrs[type]; last_la = las->log_addr[i]; las->log_addr[i] = CEC_LOG_ADDR_INVALID; if (last_la == CEC_LOG_ADDR_INVALID || last_la == CEC_LOG_ADDR_UNREGISTERED || !((1 << last_la) & type2mask[type])) last_la = la_list[0]; err = cec_config_log_addr(adap, i, last_la); if (adap->must_reconfigure) { adap->must_reconfigure = false; las->log_addr_mask = 0; goto reconfigure; } if (err > 0) /* Reused last LA */ continue; if (err < 0) goto unconfigure; for (j = 0; la_list[j] != CEC_LOG_ADDR_INVALID; j++) { /* Tried this one already, skip it */ if (la_list[j] == last_la) continue; /* The backup addresses are CEC 2.0 specific */ if ((la_list[j] == CEC_LOG_ADDR_BACKUP_1 || la_list[j] == CEC_LOG_ADDR_BACKUP_2) && las->cec_version < CEC_OP_CEC_VERSION_2_0) continue; err = cec_config_log_addr(adap, i, la_list[j]); if (err == 0) /* LA is in use */ continue; if (err < 0) goto unconfigure; /* Done, claimed an LA */ break; } if (la_list[j] == CEC_LOG_ADDR_INVALID) dprintk(1, "could not claim LA %d\n", i); } if (adap->log_addrs.log_addr_mask == 0 && !(las->flags & CEC_LOG_ADDRS_FL_ALLOW_UNREG_FALLBACK)) goto unconfigure; configured: if (adap->log_addrs.log_addr_mask == 0) { /* Fall back to unregistered */ las->log_addr[0] = CEC_LOG_ADDR_UNREGISTERED; las->log_addr_mask = 1 << las->log_addr[0]; for (i = 1; i < las->num_log_addrs; i++) las->log_addr[i] = CEC_LOG_ADDR_INVALID; } for (i = las->num_log_addrs; i < CEC_MAX_LOG_ADDRS; i++) las->log_addr[i] = CEC_LOG_ADDR_INVALID; adap->is_configured = true; adap->is_configuring = false; adap->must_reconfigure = false; cec_post_state_event(adap); /* * Now post the Report Features and Report Physical Address broadcast * messages. Note that these are non-blocking transmits, meaning that * they are just queued up and once adap->lock is unlocked the main * thread will kick in and start transmitting these. * * If after this function is done (but before one or more of these * messages are actually transmitted) the CEC adapter is unconfigured, * then any remaining messages will be dropped by the main thread. 
*/ for (i = 0; i < las->num_log_addrs; i++) { struct cec_msg msg = {}; if (las->log_addr[i] == CEC_LOG_ADDR_INVALID || (las->flags & CEC_LOG_ADDRS_FL_CDC_ONLY)) continue; msg.msg[0] = (las->log_addr[i] << 4) | 0x0f; /* Report Features must come first according to CEC 2.0 */ if (las->log_addr[i] != CEC_LOG_ADDR_UNREGISTERED && adap->log_addrs.cec_version >= CEC_OP_CEC_VERSION_2_0) { cec_fill_msg_report_features(adap, &msg, i); cec_transmit_msg_fh(adap, &msg, NULL, false); } /* Report Physical Address */ cec_msg_report_physical_addr(&msg, adap->phys_addr, las->primary_device_type[i]); dprintk(1, "config: la %d pa %x.%x.%x.%x\n", las->log_addr[i], cec_phys_addr_exp(adap->phys_addr)); cec_transmit_msg_fh(adap, &msg, NULL, false); /* Report Vendor ID */ if (adap->log_addrs.vendor_id != CEC_VENDOR_ID_NONE) { cec_msg_device_vendor_id(&msg, adap->log_addrs.vendor_id); cec_transmit_msg_fh(adap, &msg, NULL, false); } } adap->kthread_config = NULL; complete(&adap->config_completion); mutex_unlock(&adap->lock); call_void_op(adap, configured); return 0; unconfigure: for (i = 0; i < las->num_log_addrs; i++) las->log_addr[i] = CEC_LOG_ADDR_INVALID; cec_adap_unconfigure(adap); adap->is_configuring = false; adap->must_reconfigure = false; adap->kthread_config = NULL; complete(&adap->config_completion); mutex_unlock(&adap->lock); return 0; } /* * Called from either __cec_s_phys_addr or __cec_s_log_addrs to claim the * logical addresses. * * This function is called with adap->lock held. */ static void cec_claim_log_addrs(struct cec_adapter *adap, bool block) { if (WARN_ON(adap->is_configuring || adap->is_configured)) return; init_completion(&adap->config_completion); /* Ready to kick off the thread */ adap->is_configuring = true; adap->kthread_config = kthread_run(cec_config_thread_func, adap, "ceccfg-%s", adap->name); if (IS_ERR(adap->kthread_config)) { adap->kthread_config = NULL; adap->is_configuring = false; } else if (block) { mutex_unlock(&adap->lock); wait_for_completion(&adap->config_completion); mutex_lock(&adap->lock); } } /* * Helper function to enable/disable the CEC adapter. * * This function is called with adap->lock held. */ int cec_adap_enable(struct cec_adapter *adap) { bool enable; int ret = 0; enable = adap->monitor_all_cnt || adap->monitor_pin_cnt || adap->log_addrs.num_log_addrs; if (adap->needs_hpd) enable = enable && adap->phys_addr != CEC_PHYS_ADDR_INVALID; if (adap->devnode.unregistered) enable = false; if (enable == adap->is_enabled) return 0; /* serialize adap_enable */ mutex_lock(&adap->devnode.lock); if (enable) { adap->last_initiator = 0xff; adap->transmit_in_progress = false; adap->tx_low_drive_log_cnt = 0; adap->tx_error_log_cnt = 0; ret = adap->ops->adap_enable(adap, true); if (!ret) { /* * Enable monitor-all/pin modes if needed. We warn, but * continue if this fails as this is not a critical error. 
*/ if (adap->monitor_all_cnt) WARN_ON(call_op(adap, adap_monitor_all_enable, true)); if (adap->monitor_pin_cnt) WARN_ON(call_op(adap, adap_monitor_pin_enable, true)); } } else { /* Disable monitor-all/pin modes if needed (needs_hpd == 1) */ if (adap->monitor_all_cnt) WARN_ON(call_op(adap, adap_monitor_all_enable, false)); if (adap->monitor_pin_cnt) WARN_ON(call_op(adap, adap_monitor_pin_enable, false)); WARN_ON(adap->ops->adap_enable(adap, false)); adap->last_initiator = 0xff; adap->transmit_in_progress = false; adap->transmit_in_progress_aborted = false; if (adap->transmitting) cec_data_cancel(adap->transmitting, CEC_TX_STATUS_ABORTED, 0); } if (!ret) adap->is_enabled = enable; wake_up_interruptible(&adap->kthread_waitq); mutex_unlock(&adap->devnode.lock); return ret; } /* Set a new physical address and send an event notifying userspace of this. * * This function is called with adap->lock held. */ void __cec_s_phys_addr(struct cec_adapter *adap, u16 phys_addr, bool block) { bool becomes_invalid = phys_addr == CEC_PHYS_ADDR_INVALID; bool is_invalid = adap->phys_addr == CEC_PHYS_ADDR_INVALID; if (phys_addr == adap->phys_addr) return; if (!becomes_invalid && adap->devnode.unregistered) return; dprintk(1, "new physical address %x.%x.%x.%x\n", cec_phys_addr_exp(phys_addr)); if (becomes_invalid || !is_invalid) { adap->phys_addr = CEC_PHYS_ADDR_INVALID; cec_post_state_event(adap); cec_adap_unconfigure(adap); if (becomes_invalid) { cec_adap_enable(adap); return; } } adap->phys_addr = phys_addr; if (is_invalid) cec_adap_enable(adap); cec_post_state_event(adap); if (!adap->log_addrs.num_log_addrs) return; if (adap->is_configuring) adap->must_reconfigure = true; else cec_claim_log_addrs(adap, block); } void cec_s_phys_addr(struct cec_adapter *adap, u16 phys_addr, bool block) { if (IS_ERR_OR_NULL(adap)) return; mutex_lock(&adap->lock); __cec_s_phys_addr(adap, phys_addr, block); mutex_unlock(&adap->lock); } EXPORT_SYMBOL_GPL(cec_s_phys_addr); /* * Note: In the drm subsystem, prefer calling (if possible): * * cec_s_phys_addr(adap, connector->display_info.source_physical_address, false); */ void cec_s_phys_addr_from_edid(struct cec_adapter *adap, const struct edid *edid) { u16 pa = CEC_PHYS_ADDR_INVALID; if (edid && edid->extensions) pa = cec_get_edid_phys_addr((const u8 *)edid, EDID_LENGTH * (edid->extensions + 1), NULL); cec_s_phys_addr(adap, pa, false); } EXPORT_SYMBOL_GPL(cec_s_phys_addr_from_edid); void cec_s_conn_info(struct cec_adapter *adap, const struct cec_connector_info *conn_info) { if (IS_ERR_OR_NULL(adap)) return; if (!(adap->capabilities & CEC_CAP_CONNECTOR_INFO)) return; mutex_lock(&adap->lock); if (conn_info) adap->conn_info = *conn_info; else memset(&adap->conn_info, 0, sizeof(adap->conn_info)); cec_post_state_event(adap); mutex_unlock(&adap->lock); } EXPORT_SYMBOL_GPL(cec_s_conn_info); /* * Called from either the ioctl or a driver to set the logical addresses. * * This function is called with adap->lock held. 
*/ int __cec_s_log_addrs(struct cec_adapter *adap, struct cec_log_addrs *log_addrs, bool block) { u16 type_mask = 0; int err; int i; if (adap->devnode.unregistered) return -ENODEV; if (!log_addrs || log_addrs->num_log_addrs == 0) { if (!adap->log_addrs.num_log_addrs) return 0; if (adap->is_configuring || adap->is_configured) cec_adap_unconfigure(adap); adap->log_addrs.num_log_addrs = 0; for (i = 0; i < CEC_MAX_LOG_ADDRS; i++) adap->log_addrs.log_addr[i] = CEC_LOG_ADDR_INVALID; adap->log_addrs.osd_name[0] = '\0'; adap->log_addrs.vendor_id = CEC_VENDOR_ID_NONE; adap->log_addrs.cec_version = CEC_OP_CEC_VERSION_2_0; cec_adap_enable(adap); return 0; } if (log_addrs->flags & CEC_LOG_ADDRS_FL_CDC_ONLY) { /* * Sanitize log_addrs fields if a CDC-Only device is * requested. */ log_addrs->num_log_addrs = 1; log_addrs->osd_name[0] = '\0'; log_addrs->vendor_id = CEC_VENDOR_ID_NONE; log_addrs->log_addr_type[0] = CEC_LOG_ADDR_TYPE_UNREGISTERED; /* * This is just an internal convention since a CDC-Only device * doesn't have to be a switch. But switches already use * unregistered, so it makes some kind of sense to pick this * as the primary device. Since a CDC-Only device never sends * any 'normal' CEC messages this primary device type is never * sent over the CEC bus. */ log_addrs->primary_device_type[0] = CEC_OP_PRIM_DEVTYPE_SWITCH; log_addrs->all_device_types[0] = 0; log_addrs->features[0][0] = 0; log_addrs->features[0][1] = 0; } /* Ensure the osd name is 0-terminated */ log_addrs->osd_name[sizeof(log_addrs->osd_name) - 1] = '\0'; /* Sanity checks */ if (log_addrs->num_log_addrs > adap->available_log_addrs) { dprintk(1, "num_log_addrs > %d\n", adap->available_log_addrs); return -EINVAL; } /* * Vendor ID is a 24 bit number, so check if the value is * within the correct range. 
*/ if (log_addrs->vendor_id != CEC_VENDOR_ID_NONE && (log_addrs->vendor_id & 0xff000000) != 0) { dprintk(1, "invalid vendor ID\n"); return -EINVAL; } if (log_addrs->cec_version != CEC_OP_CEC_VERSION_1_4 && log_addrs->cec_version != CEC_OP_CEC_VERSION_2_0) { dprintk(1, "invalid CEC version\n"); return -EINVAL; } if (log_addrs->num_log_addrs > 1) for (i = 0; i < log_addrs->num_log_addrs; i++) if (log_addrs->log_addr_type[i] == CEC_LOG_ADDR_TYPE_UNREGISTERED) { dprintk(1, "num_log_addrs > 1 can't be combined with unregistered LA\n"); return -EINVAL; } for (i = 0; i < log_addrs->num_log_addrs; i++) { const u8 feature_sz = ARRAY_SIZE(log_addrs->features[0]); u8 *features = log_addrs->features[i]; bool op_is_dev_features = false; unsigned int j; log_addrs->log_addr[i] = CEC_LOG_ADDR_INVALID; if (log_addrs->log_addr_type[i] > CEC_LOG_ADDR_TYPE_UNREGISTERED) { dprintk(1, "unknown logical address type\n"); return -EINVAL; } if (type_mask & (1 << log_addrs->log_addr_type[i])) { dprintk(1, "duplicate logical address type\n"); return -EINVAL; } type_mask |= 1 << log_addrs->log_addr_type[i]; if ((type_mask & (1 << CEC_LOG_ADDR_TYPE_RECORD)) && (type_mask & (1 << CEC_LOG_ADDR_TYPE_PLAYBACK))) { /* Record already contains the playback functionality */ dprintk(1, "invalid record + playback combination\n"); return -EINVAL; } if (log_addrs->primary_device_type[i] > CEC_OP_PRIM_DEVTYPE_PROCESSOR) { dprintk(1, "unknown primary device type\n"); return -EINVAL; } if (log_addrs->primary_device_type[i] == 2) { dprintk(1, "invalid primary device type\n"); return -EINVAL; } for (j = 0; j < feature_sz; j++) { if ((features[j] & 0x80) == 0) { if (op_is_dev_features) break; op_is_dev_features = true; } } if (!op_is_dev_features || j == feature_sz) { dprintk(1, "malformed features\n"); return -EINVAL; } /* Zero unused part of the feature array */ memset(features + j + 1, 0, feature_sz - j - 1); } if (log_addrs->cec_version >= CEC_OP_CEC_VERSION_2_0) { if (log_addrs->num_log_addrs > 2) { dprintk(1, "CEC 2.0 allows no more than 2 logical addresses\n"); return -EINVAL; } if (log_addrs->num_log_addrs == 2) { if (!(type_mask & ((1 << CEC_LOG_ADDR_TYPE_AUDIOSYSTEM) | (1 << CEC_LOG_ADDR_TYPE_TV)))) { dprintk(1, "two LAs is only allowed for audiosystem and TV\n"); return -EINVAL; } if (!(type_mask & ((1 << CEC_LOG_ADDR_TYPE_PLAYBACK) | (1 << CEC_LOG_ADDR_TYPE_RECORD)))) { dprintk(1, "an audiosystem/TV can only be combined with record or playback\n"); return -EINVAL; } } } /* Zero unused LAs */ for (i = log_addrs->num_log_addrs; i < CEC_MAX_LOG_ADDRS; i++) { log_addrs->primary_device_type[i] = 0; log_addrs->log_addr_type[i] = 0; log_addrs->all_device_types[i] = 0; memset(log_addrs->features[i], 0, sizeof(log_addrs->features[i])); } log_addrs->log_addr_mask = adap->log_addrs.log_addr_mask; adap->log_addrs = *log_addrs; err = cec_adap_enable(adap); if (!err && adap->phys_addr != CEC_PHYS_ADDR_INVALID) cec_claim_log_addrs(adap, block); return err; } int cec_s_log_addrs(struct cec_adapter *adap, struct cec_log_addrs *log_addrs, bool block) { int err; mutex_lock(&adap->lock); err = __cec_s_log_addrs(adap, log_addrs, block); mutex_unlock(&adap->lock); return err; } EXPORT_SYMBOL_GPL(cec_s_log_addrs); /* High-level core CEC message handling */ /* Fill in the Report Features message */ static void cec_fill_msg_report_features(struct cec_adapter *adap, struct cec_msg *msg, unsigned int la_idx) { const struct cec_log_addrs *las = &adap->log_addrs; const u8 *features = las->features[la_idx]; bool op_is_dev_features = false; unsigned int 
idx; /* Report Features */ msg->msg[0] = (las->log_addr[la_idx] << 4) | 0x0f; msg->len = 4; msg->msg[1] = CEC_MSG_REPORT_FEATURES; msg->msg[2] = adap->log_addrs.cec_version; msg->msg[3] = las->all_device_types[la_idx]; /* Write RC Profiles first, then Device Features */ for (idx = 0; idx < ARRAY_SIZE(las->features[0]); idx++) { msg->msg[msg->len++] = features[idx]; if ((features[idx] & CEC_OP_FEAT_EXT) == 0) { if (op_is_dev_features) break; op_is_dev_features = true; } } } /* Transmit the Feature Abort message */ static int cec_feature_abort_reason(struct cec_adapter *adap, struct cec_msg *msg, u8 reason) { struct cec_msg tx_msg = { }; /* * Don't reply with CEC_MSG_FEATURE_ABORT to a CEC_MSG_FEATURE_ABORT * message! */ if (msg->msg[1] == CEC_MSG_FEATURE_ABORT) return 0; /* Don't Feature Abort messages from 'Unregistered' */ if (cec_msg_initiator(msg) == CEC_LOG_ADDR_UNREGISTERED) return 0; cec_msg_set_reply_to(&tx_msg, msg); cec_msg_feature_abort(&tx_msg, msg->msg[1], reason); return cec_transmit_msg(adap, &tx_msg, false); } static int cec_feature_abort(struct cec_adapter *adap, struct cec_msg *msg) { return cec_feature_abort_reason(adap, msg, CEC_OP_ABORT_UNRECOGNIZED_OP); } static int cec_feature_refused(struct cec_adapter *adap, struct cec_msg *msg) { return cec_feature_abort_reason(adap, msg, CEC_OP_ABORT_REFUSED); } /* * Called when a CEC message is received. This function will do any * necessary core processing. The is_reply bool is true if this message * is a reply to an earlier transmit. * * The message is either a broadcast message or a valid directed message. */ static int cec_receive_notify(struct cec_adapter *adap, struct cec_msg *msg, bool is_reply) { bool is_broadcast = cec_msg_is_broadcast(msg); u8 dest_laddr = cec_msg_destination(msg); u8 init_laddr = cec_msg_initiator(msg); u8 devtype = cec_log_addr2dev(adap, dest_laddr); int la_idx = cec_log_addr2idx(adap, dest_laddr); bool from_unregistered = init_laddr == 0xf; struct cec_msg tx_cec_msg = { }; dprintk(2, "%s: %*ph\n", __func__, msg->len, msg->msg); /* If this is a CDC-Only device, then ignore any non-CDC messages */ if (cec_is_cdc_only(&adap->log_addrs) && msg->msg[1] != CEC_MSG_CDC_MESSAGE) return 0; /* Allow drivers to process the message first */ if (adap->ops->received && !adap->devnode.unregistered && adap->ops->received(adap, msg) != -ENOMSG) return 0; /* * REPORT_PHYSICAL_ADDR, CEC_MSG_USER_CONTROL_PRESSED and * CEC_MSG_USER_CONTROL_RELEASED messages always have to be * handled by the CEC core, even if the passthrough mode is on. * The others are just ignored if passthrough mode is on. */ switch (msg->msg[1]) { case CEC_MSG_GET_CEC_VERSION: case CEC_MSG_ABORT: case CEC_MSG_GIVE_DEVICE_POWER_STATUS: case CEC_MSG_GIVE_OSD_NAME: /* * These messages reply with a directed message, so ignore if * the initiator is Unregistered. */ if (!adap->passthrough && from_unregistered) return 0; fallthrough; case CEC_MSG_GIVE_DEVICE_VENDOR_ID: case CEC_MSG_GIVE_FEATURES: case CEC_MSG_GIVE_PHYSICAL_ADDR: /* * Skip processing these messages if the passthrough mode * is on. */ if (adap->passthrough) goto skip_processing; /* Ignore if addressing is wrong */ if (is_broadcast) return 0; break; case CEC_MSG_USER_CONTROL_PRESSED: case CEC_MSG_USER_CONTROL_RELEASED: /* Wrong addressing mode: don't process */ if (is_broadcast || from_unregistered) goto skip_processing; break; case CEC_MSG_REPORT_PHYSICAL_ADDR: /* * This message is always processed, regardless of the * passthrough setting. 
* * Exception: don't process if wrong addressing mode. */ if (!is_broadcast) goto skip_processing; break; default: break; } cec_msg_set_reply_to(&tx_cec_msg, msg); switch (msg->msg[1]) { /* The following messages are processed but still passed through */ case CEC_MSG_REPORT_PHYSICAL_ADDR: { u16 pa = (msg->msg[2] << 8) | msg->msg[3]; dprintk(1, "reported physical address %x.%x.%x.%x for logical address %d\n", cec_phys_addr_exp(pa), init_laddr); break; } case CEC_MSG_USER_CONTROL_PRESSED: if (!(adap->capabilities & CEC_CAP_RC) || !(adap->log_addrs.flags & CEC_LOG_ADDRS_FL_ALLOW_RC_PASSTHRU)) break; #ifdef CONFIG_MEDIA_CEC_RC switch (msg->msg[2]) { /* * Play function, this message can have variable length * depending on the specific play function that is used. */ case CEC_OP_UI_CMD_PLAY_FUNCTION: if (msg->len == 2) rc_keydown(adap->rc, RC_PROTO_CEC, msg->msg[2], 0); else rc_keydown(adap->rc, RC_PROTO_CEC, msg->msg[2] << 8 | msg->msg[3], 0); break; /* * Other function messages that are not handled. * Currently the RC framework does not allow to supply an * additional parameter to a keypress. These "keys" contain * other information such as channel number, an input number * etc. * For the time being these messages are not processed by the * framework and are simply forwarded to the user space. */ case CEC_OP_UI_CMD_SELECT_BROADCAST_TYPE: case CEC_OP_UI_CMD_SELECT_SOUND_PRESENTATION: case CEC_OP_UI_CMD_TUNE_FUNCTION: case CEC_OP_UI_CMD_SELECT_MEDIA_FUNCTION: case CEC_OP_UI_CMD_SELECT_AV_INPUT_FUNCTION: case CEC_OP_UI_CMD_SELECT_AUDIO_INPUT_FUNCTION: break; default: rc_keydown(adap->rc, RC_PROTO_CEC, msg->msg[2], 0); break; } #endif break; case CEC_MSG_USER_CONTROL_RELEASED: if (!(adap->capabilities & CEC_CAP_RC) || !(adap->log_addrs.flags & CEC_LOG_ADDRS_FL_ALLOW_RC_PASSTHRU)) break; #ifdef CONFIG_MEDIA_CEC_RC rc_keyup(adap->rc); #endif break; /* * The remaining messages are only processed if the passthrough mode * is off. */ case CEC_MSG_GET_CEC_VERSION: cec_msg_cec_version(&tx_cec_msg, adap->log_addrs.cec_version); return cec_transmit_msg(adap, &tx_cec_msg, false); case CEC_MSG_GIVE_PHYSICAL_ADDR: /* Do nothing for CEC switches using addr 15 */ if (devtype == CEC_OP_PRIM_DEVTYPE_SWITCH && dest_laddr == 15) return 0; cec_msg_report_physical_addr(&tx_cec_msg, adap->phys_addr, devtype); return cec_transmit_msg(adap, &tx_cec_msg, false); case CEC_MSG_GIVE_DEVICE_VENDOR_ID: if (adap->log_addrs.vendor_id == CEC_VENDOR_ID_NONE) return cec_feature_abort(adap, msg); cec_msg_device_vendor_id(&tx_cec_msg, adap->log_addrs.vendor_id); return cec_transmit_msg(adap, &tx_cec_msg, false); case CEC_MSG_ABORT: /* Do nothing for CEC switches */ if (devtype == CEC_OP_PRIM_DEVTYPE_SWITCH) return 0; return cec_feature_refused(adap, msg); case CEC_MSG_GIVE_OSD_NAME: { if (adap->log_addrs.osd_name[0] == 0) return cec_feature_abort(adap, msg); cec_msg_set_osd_name(&tx_cec_msg, adap->log_addrs.osd_name); return cec_transmit_msg(adap, &tx_cec_msg, false); } case CEC_MSG_GIVE_FEATURES: if (adap->log_addrs.cec_version < CEC_OP_CEC_VERSION_2_0) return cec_feature_abort(adap, msg); cec_fill_msg_report_features(adap, &tx_cec_msg, la_idx); return cec_transmit_msg(adap, &tx_cec_msg, false); default: /* * Unprocessed messages are aborted if userspace isn't doing * any processing either. 
*/ if (!is_broadcast && !is_reply && !adap->follower_cnt && !adap->cec_follower && msg->msg[1] != CEC_MSG_FEATURE_ABORT) return cec_feature_abort(adap, msg); break; } skip_processing: /* If this was a reply, then we're done, unless otherwise specified */ if (is_reply && !(msg->flags & CEC_MSG_FL_REPLY_TO_FOLLOWERS)) return 0; /* * Send to the exclusive follower if there is one, otherwise send * to all followers. */ if (adap->cec_follower) cec_queue_msg_fh(adap->cec_follower, msg); else cec_queue_msg_followers(adap, msg); return 0; } /* * Helper functions to keep track of the 'monitor all' use count. * * These functions are called with adap->lock held. */ int cec_monitor_all_cnt_inc(struct cec_adapter *adap) { int ret; if (adap->monitor_all_cnt++) return 0; ret = cec_adap_enable(adap); if (ret) adap->monitor_all_cnt--; return ret; } void cec_monitor_all_cnt_dec(struct cec_adapter *adap) { if (WARN_ON(!adap->monitor_all_cnt)) return; if (--adap->monitor_all_cnt) return; WARN_ON(call_op(adap, adap_monitor_all_enable, false)); cec_adap_enable(adap); } /* * Helper functions to keep track of the 'monitor pin' use count. * * These functions are called with adap->lock held. */ int cec_monitor_pin_cnt_inc(struct cec_adapter *adap) { int ret; if (adap->monitor_pin_cnt++) return 0; ret = cec_adap_enable(adap); if (ret) adap->monitor_pin_cnt--; return ret; } void cec_monitor_pin_cnt_dec(struct cec_adapter *adap) { if (WARN_ON(!adap->monitor_pin_cnt)) return; if (--adap->monitor_pin_cnt) return; WARN_ON(call_op(adap, adap_monitor_pin_enable, false)); cec_adap_enable(adap); } #ifdef CONFIG_DEBUG_FS /* * Log the current state of the CEC adapter. * Very useful for debugging. */ int cec_adap_status(struct seq_file *file, void *priv) { struct cec_adapter *adap = dev_get_drvdata(file->private); struct cec_data *data; mutex_lock(&adap->lock); seq_printf(file, "enabled: %d\n", adap->is_enabled); seq_printf(file, "configured: %d\n", adap->is_configured); seq_printf(file, "configuring: %d\n", adap->is_configuring); seq_printf(file, "phys_addr: %x.%x.%x.%x\n", cec_phys_addr_exp(adap->phys_addr)); seq_printf(file, "number of LAs: %d\n", adap->log_addrs.num_log_addrs); seq_printf(file, "LA mask: 0x%04x\n", adap->log_addrs.log_addr_mask); if (adap->cec_follower) seq_printf(file, "has CEC follower%s\n", adap->passthrough ? 
" (in passthrough mode)" : ""); if (adap->cec_initiator) seq_puts(file, "has CEC initiator\n"); if (adap->monitor_all_cnt) seq_printf(file, "file handles in Monitor All mode: %u\n", adap->monitor_all_cnt); if (adap->monitor_pin_cnt) seq_printf(file, "file handles in Monitor Pin mode: %u\n", adap->monitor_pin_cnt); if (adap->tx_timeout_cnt) { seq_printf(file, "transmit timeout count: %u\n", adap->tx_timeout_cnt); adap->tx_timeout_cnt = 0; } if (adap->tx_low_drive_cnt) { seq_printf(file, "transmit low drive count: %u\n", adap->tx_low_drive_cnt); adap->tx_low_drive_cnt = 0; } if (adap->tx_arb_lost_cnt) { seq_printf(file, "transmit arbitration lost count: %u\n", adap->tx_arb_lost_cnt); adap->tx_arb_lost_cnt = 0; } if (adap->tx_error_cnt) { seq_printf(file, "transmit error count: %u\n", adap->tx_error_cnt); adap->tx_error_cnt = 0; } data = adap->transmitting; if (data) seq_printf(file, "transmitting message: %*ph (reply: %02x, timeout: %ums)\n", data->msg.len, data->msg.msg, data->msg.reply, data->msg.timeout); seq_printf(file, "pending transmits: %u\n", adap->transmit_queue_sz); list_for_each_entry(data, &adap->transmit_queue, list) { seq_printf(file, "queued tx message: %*ph (reply: %02x, timeout: %ums)\n", data->msg.len, data->msg.msg, data->msg.reply, data->msg.timeout); } list_for_each_entry(data, &adap->wait_queue, list) { seq_printf(file, "message waiting for reply: %*ph (reply: %02x, timeout: %ums)\n", data->msg.len, data->msg.msg, data->msg.reply, data->msg.timeout); } call_void_op(adap, adap_status, file); mutex_unlock(&adap->lock); return 0; } #endif
// SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 fragment reassembly for connection tracking * * Copyright (C)2004 USAGI/WIDE Project * * Author: * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * * Based on: net/ipv6/reassembly.c */ #define pr_fmt(fmt) "IPv6-nf: " fmt #include <linux/errno.h> #include <linux/types.h> #include <linux/string.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/ipv6.h> #include <linux/slab.h> #include <net/ipv6_frag.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> #include <linux/sysctl.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <linux/kernel.h> #include <linux/module.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include <net/netns/generic.h> static const char nf_frags_cache_name[] = "nf-frags"; static unsigned int nf_frag_pernet_id __read_mostly; static struct inet_frags nf_frags; static struct nft_ct_frag6_pernet *nf_frag_pernet(struct net *net) { return net_generic(net, nf_frag_pernet_id); } #ifdef CONFIG_SYSCTL static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname =
"nf_conntrack_frag6_low_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "nf_conntrack_frag6_high_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { } }; static int nf_ct_frag6_sysctl_register(struct net *net) { struct nft_ct_frag6_pernet *nf_frag; struct ctl_table *table; struct ctl_table_header *hdr; table = nf_ct_frag6_sysctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(nf_ct_frag6_sysctl_table), GFP_KERNEL); if (table == NULL) goto err_alloc; } nf_frag = nf_frag_pernet(net); table[0].data = &nf_frag->fqdir->timeout; table[1].data = &nf_frag->fqdir->low_thresh; table[1].extra2 = &nf_frag->fqdir->high_thresh; table[2].data = &nf_frag->fqdir->high_thresh; table[2].extra1 = &nf_frag->fqdir->low_thresh; hdr = register_net_sysctl_sz(net, "net/netfilter", table, ARRAY_SIZE(nf_ct_frag6_sysctl_table)); if (hdr == NULL) goto err_reg; nf_frag->nf_frag_frags_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); struct ctl_table *table; table = nf_frag->nf_frag_frags_hdr->ctl_table_arg; unregister_net_sysctl_table(nf_frag->nf_frag_frags_hdr); if (!net_eq(net, &init_net)) kfree(table); } #else static int nf_ct_frag6_sysctl_register(struct net *net) { return 0; } static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { } #endif static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev); static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); } static void nf_ct_frag6_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; fq = container_of(frag, struct frag_queue, q); ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } /* Creation primitives. */ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, const struct ipv6hdr *hdr, int iif) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); struct frag_v6_compare_key key = { .id = id, .saddr = hdr->saddr, .daddr = hdr->daddr, .user = user, .iif = iif, }; struct inet_frag_queue *q; q = inet_frag_find(nf_frag->fqdir, &key); if (!q) return NULL; return container_of(q, struct frag_queue, q); } static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, const struct frag_hdr *fhdr, int nhoff) { unsigned int payload_len; struct net_device *dev; struct sk_buff *prev; int offset, end, err; u8 ecn; if (fq->q.flags & INET_FRAG_COMPLETE) { pr_debug("Already completed\n"); goto err; } payload_len = ntohs(ipv6_hdr(skb)->payload_len); offset = ntohs(fhdr->frag_off) & ~0x7; end = offset + (payload_len - ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { pr_debug("offset is too large.\n"); return -EINVAL; } ecn = ip6_frag_ecn(ipv6_hdr(skb)); if (skb->ip_summed == CHECKSUM_COMPLETE) { const unsigned char *nh = skb_network_header(skb); skb->csum = csum_sub(skb->csum, csum_partial(nh, (u8 *)(fhdr + 1) - nh, 0)); } /* Is this the final fragment? */ if (!(fhdr->frag_off & htons(IP6_MF))) { /* If we already have some bits beyond end * or have different end, the segment is corrupted. 
*/ if (end < fq->q.len || ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) { pr_debug("already received last fragment\n"); goto err; } fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { /* Check if the fragment is rounded to 8 bytes. * Required by the RFC. */ if (end & 0x7) { /* RFC2460 says always send parameter problem in * this case. -DaveM */ pr_debug("end of fragment not rounded to 8 bytes.\n"); inet_frag_kill(&fq->q); return -EPROTO; } if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ if (fq->q.flags & INET_FRAG_LAST_IN) { pr_debug("last packet already reached.\n"); goto err; } fq->q.len = end; } } if (end == offset) goto err; /* Point into the IP datagram 'data' part. */ if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) { pr_debug("queue: message is too short.\n"); goto err; } if (pskb_trim_rcsum(skb, end - offset)) { pr_debug("Can't trim\n"); goto err; } /* Note : skb->rbnode and skb->dev share the same location. */ dev = skb->dev; /* Makes sure compiler wont do silly aliasing games */ barrier(); prev = fq->q.fragments_tail; err = inet_frag_queue_insert(&fq->q, skb, offset, end); if (err) { if (err == IPFRAG_DUP) { /* No error for duplicates, pretend they got queued. */ kfree_skb_reason(skb, SKB_DROP_REASON_DUP_FRAG); return -EINPROGRESS; } goto insert_error; } if (dev) fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; fq->q.mono_delivery_time = skb->mono_delivery_time; fq->q.meat += skb->len; fq->ecn |= ecn; if (payload_len > fq->q.max_size) fq->q.max_size = payload_len; add_frag_mem_limit(fq->q.fqdir, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. */ if (offset == 0) { fq->nhoffset = nhoff; fq->q.flags |= INET_FRAG_FIRST_IN; } if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; err = nf_ct_frag6_reasm(fq, skb, prev, dev); skb->_skb_refdst = orefdst; /* After queue has assumed skb ownership, only 0 or * -EINPROGRESS must be returned. */ return err ? -EINPROGRESS : 0; } skb_dst_drop(skb); return -EINPROGRESS; insert_error: inet_frag_kill(&fq->q); err: skb_dst_drop(skb); return -EINVAL; } /* * Check if this packet is complete. * * It is called with locked fq, and caller must check that * queue is eligible for reassembly i.e. it is not COMPLETE, * the last and the first frames arrived and all the bits are here. */ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) { void *reasm_data; int payload_len; u8 ecn; inet_frag_kill(&fq->q); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) goto err; reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); if (!reasm_data) goto err; payload_len = ((skb->data - skb_network_header(skb)) - sizeof(struct ipv6hdr) + fq->q.len - sizeof(struct frag_hdr)); if (payload_len > IPV6_MAXPLEN) { net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n", payload_len); goto err; } /* We have to remove fragment header from datagram and to relocate * header in order to calculate ICV correctly. 
*/ skb_network_header(skb)[fq->nhoffset] = skb_transport_header(skb)[0]; memmove(skb->head + sizeof(struct frag_hdr), skb->head, (skb->data - skb->head) - sizeof(struct frag_hdr)); skb->mac_header += sizeof(struct frag_hdr); skb->network_header += sizeof(struct frag_hdr); skb_reset_transport_header(skb); inet_frag_reasm_finish(&fq->q, skb, reasm_data, false); skb->ignore_df = 1; skb->dev = dev; ipv6_hdr(skb)->payload_len = htons(payload_len); ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn); IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size; IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; /* Yes, and fold redundant checksum back. 8) */ if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_partial(skb_network_header(skb), skb_network_header_len(skb), skb->csum); fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; fq->q.last_run_head = NULL; return 0; err: inet_frag_kill(&fq->q); return -EINVAL; } /* * find the header just before Fragment Header. * * if success return 0 and set ... * (*prevhdrp): the value of "Next Header Field" in the header * just before Fragment Header. * (*prevhoff): the offset of "Next Header Field" in the header * just before Fragment Header. * (*fhoff) : the offset of Fragment Header. * * Based on ipv6_skip_hdr() in net/ipv6/exthdr.c * */ static int find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) { u8 nexthdr = ipv6_hdr(skb)->nexthdr; const int netoff = skb_network_offset(skb); u8 prev_nhoff = netoff + offsetof(struct ipv6hdr, nexthdr); int start = netoff + sizeof(struct ipv6hdr); int len = skb->len - start; u8 prevhdr = NEXTHDR_IPV6; while (nexthdr != NEXTHDR_FRAGMENT) { struct ipv6_opt_hdr hdr; int hdrlen; if (!ipv6_ext_hdr(nexthdr)) { return -1; } if (nexthdr == NEXTHDR_NONE) { pr_debug("next header is none\n"); return -1; } if (len < (int)sizeof(struct ipv6_opt_hdr)) { pr_debug("too short\n"); return -1; } if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) BUG(); if (nexthdr == NEXTHDR_AUTH) hdrlen = ipv6_authlen(&hdr); else hdrlen = ipv6_optlen(&hdr); prevhdr = nexthdr; prev_nhoff = start; nexthdr = hdr.nexthdr; len -= hdrlen; start += hdrlen; } if (len < 0) return -1; *prevhdrp = prevhdr; *prevhoff = prev_nhoff; *fhoff = start; return 0; } int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) { u16 savethdr = skb->transport_header; u8 nexthdr = NEXTHDR_FRAGMENT; int fhoff, nhoff, ret; struct frag_hdr *fhdr; struct frag_queue *fq; struct ipv6hdr *hdr; u8 prevhdr; /* Jumbo payload inhibits frag. header */ if (ipv6_hdr(skb)->payload_len == 0) { pr_debug("payload len = 0\n"); return 0; } if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0) return 0; /* Discard the first fragment if it does not include all headers * RFC 8200, Section 4.5 */ if (ipv6frag_thdr_truncated(skb, fhoff, &nexthdr)) { pr_debug("Drop incomplete fragment\n"); return 0; } if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr))) return -ENOMEM; skb_set_transport_header(skb, fhoff); hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); skb_orphan(skb); fq = fq_find(net, fhdr->identification, user, hdr, skb->dev ? 
skb->dev->ifindex : 0); if (fq == NULL) { pr_debug("Can't find and can't create new queue\n"); return -ENOMEM; } spin_lock_bh(&fq->q.lock); ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff); if (ret == -EPROTO) { skb->transport_header = savethdr; ret = 0; } spin_unlock_bh(&fq->q.lock); inet_frag_put(&fq->q); return ret; } EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); static int nf_ct_net_init(struct net *net) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); int res; res = fqdir_init(&nf_frag->fqdir, &nf_frags, net); if (res < 0) return res; nf_frag->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; nf_frag->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; nf_frag->fqdir->timeout = IPV6_FRAG_TIMEOUT; res = nf_ct_frag6_sysctl_register(net); if (res < 0) fqdir_exit(nf_frag->fqdir); return res; } static void nf_ct_net_pre_exit(struct net *net) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); fqdir_pre_exit(nf_frag->fqdir); } static void nf_ct_net_exit(struct net *net) { struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); nf_ct_frags6_sysctl_unregister(net); fqdir_exit(nf_frag->fqdir); } static struct pernet_operations nf_ct_net_ops = { .init = nf_ct_net_init, .pre_exit = nf_ct_net_pre_exit, .exit = nf_ct_net_exit, .id = &nf_frag_pernet_id, .size = sizeof(struct nft_ct_frag6_pernet), }; static const struct rhashtable_params nfct_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), .hashfn = ip6frag_key_hashfn, .obj_hashfn = ip6frag_obj_hashfn, .obj_cmpfn = ip6frag_obj_cmpfn, .automatic_shrinking = true, }; int nf_ct_frag6_init(void) { int ret = 0; nf_frags.constructor = ip6frag_init; nf_frags.destructor = NULL; nf_frags.qsize = sizeof(struct frag_queue); nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.frags_cache_name = nf_frags_cache_name; nf_frags.rhash_params = nfct_rhash_params; ret = inet_frags_init(&nf_frags); if (ret) goto out; ret = register_pernet_subsys(&nf_ct_net_ops); if (ret) inet_frags_fini(&nf_frags); out: return ret; } void nf_ct_frag6_cleanup(void) { unregister_pernet_subsys(&nf_ct_net_ops); inet_frags_fini(&nf_frags); }
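/*
 * Illustrative caller sketch (not part of this file): how a netfilter defrag
 * hook might interpret nf_ct_frag6_gather() return values, per the comment in
 * nf_ct_frag6_queue() that only 0 or -EINPROGRESS may be returned once the
 * queue owns the skb. The function name and the way 'user' is supplied are
 * assumptions for illustration; real hooks derive 'user' from the conntrack
 * zone and hook direction.
 */
#include <linux/netfilter.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>

static unsigned int example_ipv6_defrag(struct net *net, struct sk_buff *skb,
					u32 user)
{
	int err = nf_ct_frag6_gather(net, skb, user);

	/* Fragment queued for reassembly: the skb now belongs to the queue. */
	if (err == -EINPROGRESS)
		return NF_STOLEN;

	/* 0: not a fragment, or reassembled in place; otherwise drop it. */
	return err == 0 ? NF_ACCEPT : NF_DROP;
}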
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_ERR_H
#define _LINUX_ERR_H

#include <linux/compiler.h>
#include <linux/types.h>

#include <asm/errno.h>

/*
 * Kernel pointers have redundant information, so we can use a
 * scheme where we can return either an error code or a normal
 * pointer with the same return value.
 *
 * This should be a per-architecture thing, to allow different
 * error and pointer decisions.
 */
#define MAX_ERRNO	4095

#ifndef __ASSEMBLY__

/**
 * IS_ERR_VALUE - Detect an error pointer.
 * @x: The pointer to check.
 *
 * Like IS_ERR(), but does not generate a compiler warning if result is unused.
 */
#define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO)

/**
 * ERR_PTR - Create an error pointer.
 * @error: A negative error code.
 *
 * Encodes @error into a pointer value. Users should consider the result
 * opaque and not assume anything about how the error is encoded.
 *
 * Return: A pointer with @error encoded within its value.
 */
static inline void * __must_check ERR_PTR(long error)
{
	return (void *) error;
}

/**
 * PTR_ERR - Extract the error code from an error pointer.
 * @ptr: An error pointer.
 * Return: The error code within @ptr.
 */
static inline long __must_check PTR_ERR(__force const void *ptr)
{
	return (long) ptr;
}

/**
 * IS_ERR - Detect an error pointer.
 * @ptr: The pointer to check.
 * Return: true if @ptr is an error pointer, false otherwise.
 */
static inline bool __must_check IS_ERR(__force const void *ptr)
{
	return IS_ERR_VALUE((unsigned long)ptr);
}

/**
 * IS_ERR_OR_NULL - Detect an error pointer or a null pointer.
 * @ptr: The pointer to check.
 *
 * Like IS_ERR(), but also returns true for a null pointer.
 */
static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr)
{
	return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr);
}

/**
 * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type
 * @ptr: The pointer to cast.
 *
 * Explicitly cast an error-valued pointer to another pointer type in such a
 * way as to make it clear that's what's going on.
 */
static inline void * __must_check ERR_CAST(__force const void *ptr)
{
	/* cast away the const */
	return (void *) ptr;
}

/**
 * PTR_ERR_OR_ZERO - Extract the error code from a pointer if it has one.
 * @ptr: A potential error pointer.
 *
 * Convenience function that can be used inside a function that returns
 * an error code to propagate errors received as error pointers.
 * For example, ``return PTR_ERR_OR_ZERO(ptr);`` replaces:
 *
 * .. code-block:: c
 *
 *	if (IS_ERR(ptr))
 *		return PTR_ERR(ptr);
 *	else
 *		return 0;
 *
 * Return: The error code within @ptr if it is an error pointer; 0 otherwise.
 */
static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
{
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);
	else
		return 0;
}

#endif

#endif /* _LINUX_ERR_H */
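/*
 * Illustrative usage sketch (not part of err.h): the ERR_PTR()/IS_ERR()/
 * PTR_ERR() pattern lets a pointer-returning function report an errno in the
 * same return value. The structure and function names are made up for the
 * example.
 */
#include <linux/err.h>
#include <linux/slab.h>

struct example_ctx { int id; };

static struct example_ctx *example_ctx_create(int id)
{
	struct example_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (!ctx)
		return ERR_PTR(-ENOMEM);	/* encode the errno in the pointer */
	if (id < 0) {
		kfree(ctx);
		return ERR_PTR(-EINVAL);
	}
	ctx->id = id;
	return ctx;
}

static int example_ctx_use(int id)
{
	struct example_ctx *ctx = example_ctx_create(id);

	if (IS_ERR(ctx))
		return PTR_ERR(ctx);		/* decode it back into an errno */
	/* ... use ctx ... */
	kfree(ctx);
	return 0;
}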
95 1 94 1 5 5 2 5 5 1 4 4 4 4 4 4 4 8 8 1 7 7 4 4 5 7 4 2 258 258 258 7 3 7 118 118 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 // SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * * This module enables kernel and guest-mode vCPU access to guest physical * memory with suitable invalidation mechanisms. * * Copyright © 2021 Amazon.com, Inc. or its affiliates. * * Authors: * David Woodhouse <dwmw2@infradead.org> */ #include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/highmem.h> #include <linux/module.h> #include <linux/errno.h> #include "kvm_mm.h" /* * MMU notifier 'invalidate_range_start' hook. */ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, unsigned long end, bool may_block) { DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); struct gfn_to_pfn_cache *gpc; bool evict_vcpus = false; spin_lock(&kvm->gpc_lock); list_for_each_entry(gpc, &kvm->gpc_list, list) { write_lock_irq(&gpc->lock); /* Only a single page so no need to care about length */ if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) && gpc->uhva >= start && gpc->uhva < end) { gpc->valid = false; /* * If a guest vCPU could be using the physical address, * it needs to be forced out of guest mode. */ if (gpc->usage & KVM_GUEST_USES_PFN) { if (!evict_vcpus) { evict_vcpus = true; bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS); } __set_bit(gpc->vcpu->vcpu_idx, vcpu_bitmap); } } write_unlock_irq(&gpc->lock); } spin_unlock(&kvm->gpc_lock); if (evict_vcpus) { /* * KVM needs to ensure the vCPU is fully out of guest context * before allowing the invalidation to continue. */ unsigned int req = KVM_REQ_OUTSIDE_GUEST_MODE; bool called; /* * If the OOM reaper is active, then all vCPUs should have * been stopped already, so perform the request without * KVM_REQUEST_WAIT and be sad if any needed to be IPI'd. 
*/ if (!may_block) req &= ~KVM_REQUEST_WAIT; called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap); WARN_ON_ONCE(called && !may_block); } } bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(gpc->kvm); if (!gpc->active) return false; if ((gpc->gpa & ~PAGE_MASK) + len > PAGE_SIZE) return false; if (gpc->generation != slots->generation || kvm_is_error_hva(gpc->uhva)) return false; if (!gpc->valid) return false; return true; } EXPORT_SYMBOL_GPL(kvm_gpc_check); static void gpc_unmap_khva(kvm_pfn_t pfn, void *khva) { /* Unmap the old pfn/page if it was mapped before. */ if (!is_error_noslot_pfn(pfn) && khva) { if (pfn_valid(pfn)) kunmap(pfn_to_page(pfn)); #ifdef CONFIG_HAS_IOMEM else memunmap(khva); #endif } } static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq) { /* * mn_active_invalidate_count acts for all intents and purposes * like mmu_invalidate_in_progress here; but the latter cannot * be used here because the invalidation of caches in the * mmu_notifier event occurs _before_ mmu_invalidate_in_progress * is elevated. * * Note, it does not matter that mn_active_invalidate_count * is not protected by gpc->lock. It is guaranteed to * be elevated before the mmu_notifier acquires gpc->lock, and * isn't dropped until after mmu_invalidate_seq is updated. */ if (kvm->mn_active_invalidate_count) return true; /* * Ensure mn_active_invalidate_count is read before * mmu_invalidate_seq. This pairs with the smp_wmb() in * mmu_notifier_invalidate_range_end() to guarantee either the * old (non-zero) value of mn_active_invalidate_count or the * new (incremented) value of mmu_invalidate_seq is observed. */ smp_rmb(); return kvm->mmu_invalidate_seq != mmu_seq; } static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) { /* Note, the new page offset may be different than the old! */ void *old_khva = gpc->khva - offset_in_page(gpc->khva); kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT; void *new_khva = NULL; unsigned long mmu_seq; lockdep_assert_held(&gpc->refresh_lock); lockdep_assert_held_write(&gpc->lock); /* * Invalidate the cache prior to dropping gpc->lock, the gpa=>uhva * assets have already been updated and so a concurrent check() from a * different task may not fail the gpa/uhva/generation checks. */ gpc->valid = false; do { mmu_seq = gpc->kvm->mmu_invalidate_seq; smp_rmb(); write_unlock_irq(&gpc->lock); /* * If the previous iteration "failed" due to an mmu_notifier * event, release the pfn and unmap the kernel virtual address * from the previous attempt. Unmapping might sleep, so this * needs to be done after dropping the lock. Opportunistically * check for resched while the lock isn't held. */ if (new_pfn != KVM_PFN_ERR_FAULT) { /* * Keep the mapping if the previous iteration reused * the existing mapping and didn't create a new one. */ if (new_khva != old_khva) gpc_unmap_khva(new_pfn, new_khva); kvm_release_pfn_clean(new_pfn); cond_resched(); } /* We always request a writeable mapping */ new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL); if (is_error_noslot_pfn(new_pfn)) goto out_error; /* * Obtain a new kernel mapping if KVM itself will access the * pfn. Note, kmap() and memremap() can both sleep, so this * too must be done outside of gpc->lock! 
*/ if (gpc->usage & KVM_HOST_USES_PFN) { if (new_pfn == gpc->pfn) { new_khva = old_khva; } else if (pfn_valid(new_pfn)) { new_khva = kmap(pfn_to_page(new_pfn)); #ifdef CONFIG_HAS_IOMEM } else { new_khva = memremap(pfn_to_hpa(new_pfn), PAGE_SIZE, MEMREMAP_WB); #endif } if (!new_khva) { kvm_release_pfn_clean(new_pfn); goto out_error; } } write_lock_irq(&gpc->lock); /* * Other tasks must wait for _this_ refresh to complete before * attempting to refresh. */ WARN_ON_ONCE(gpc->valid); } while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq)); gpc->valid = true; gpc->pfn = new_pfn; gpc->khva = new_khva + (gpc->gpa & ~PAGE_MASK); /* * Put the reference to the _new_ pfn. The pfn is now tracked by the * cache and can be safely migrated, swapped, etc... as the cache will * invalidate any mappings in response to relevant mmu_notifier events. */ kvm_release_pfn_clean(new_pfn); return 0; out_error: write_lock_irq(&gpc->lock); return -EFAULT; } static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) { struct kvm_memslots *slots = kvm_memslots(gpc->kvm); unsigned long page_offset = gpa & ~PAGE_MASK; bool unmap_old = false; unsigned long old_uhva; kvm_pfn_t old_pfn; void *old_khva; int ret; /* * If must fit within a single page. The 'len' argument is * only to enforce that. */ if (page_offset + len > PAGE_SIZE) return -EINVAL; /* * If another task is refreshing the cache, wait for it to complete. * There is no guarantee that concurrent refreshes will see the same * gpa, memslots generation, etc..., so they must be fully serialized. */ mutex_lock(&gpc->refresh_lock); write_lock_irq(&gpc->lock); if (!gpc->active) { ret = -EINVAL; goto out_unlock; } old_pfn = gpc->pfn; old_khva = gpc->khva - offset_in_page(gpc->khva); old_uhva = gpc->uhva; /* If the userspace HVA is invalid, refresh that first */ if (gpc->gpa != gpa || gpc->generation != slots->generation || kvm_is_error_hva(gpc->uhva)) { gfn_t gfn = gpa_to_gfn(gpa); gpc->gpa = gpa; gpc->generation = slots->generation; gpc->memslot = __gfn_to_memslot(slots, gfn); gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn); if (kvm_is_error_hva(gpc->uhva)) { ret = -EFAULT; goto out; } } /* * If the userspace HVA changed or the PFN was already invalid, * drop the lock and do the HVA to PFN lookup again. */ if (!gpc->valid || old_uhva != gpc->uhva) { ret = hva_to_pfn_retry(gpc); } else { /* * If the HVA→PFN mapping was already valid, don't unmap it. * But do update gpc->khva because the offset within the page * may have changed. */ gpc->khva = old_khva + page_offset; ret = 0; goto out_unlock; } out: /* * Invalidate the cache and purge the pfn/khva if the refresh failed. * Some/all of the uhva, gpa, and memslot generation info may still be * valid, leave it as is. */ if (ret) { gpc->valid = false; gpc->pfn = KVM_PFN_ERR_FAULT; gpc->khva = NULL; } /* Detect a pfn change before dropping the lock! 
*/ unmap_old = (old_pfn != gpc->pfn); out_unlock: write_unlock_irq(&gpc->lock); mutex_unlock(&gpc->refresh_lock); if (unmap_old) gpc_unmap_khva(old_pfn, old_khva); return ret; } int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len) { return __kvm_gpc_refresh(gpc, gpc->gpa, len); } EXPORT_SYMBOL_GPL(kvm_gpc_refresh); void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm, struct kvm_vcpu *vcpu, enum pfn_cache_usage usage) { WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage); WARN_ON_ONCE((usage & KVM_GUEST_USES_PFN) && !vcpu); rwlock_init(&gpc->lock); mutex_init(&gpc->refresh_lock); gpc->kvm = kvm; gpc->vcpu = vcpu; gpc->usage = usage; gpc->pfn = KVM_PFN_ERR_FAULT; gpc->uhva = KVM_HVA_ERR_BAD; } EXPORT_SYMBOL_GPL(kvm_gpc_init); int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len) { struct kvm *kvm = gpc->kvm; if (!gpc->active) { if (KVM_BUG_ON(gpc->valid, kvm)) return -EIO; spin_lock(&kvm->gpc_lock); list_add(&gpc->list, &kvm->gpc_list); spin_unlock(&kvm->gpc_lock); /* * Activate the cache after adding it to the list, a concurrent * refresh must not establish a mapping until the cache is * reachable by mmu_notifier events. */ write_lock_irq(&gpc->lock); gpc->active = true; write_unlock_irq(&gpc->lock); } return __kvm_gpc_refresh(gpc, gpa, len); } EXPORT_SYMBOL_GPL(kvm_gpc_activate); void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc) { struct kvm *kvm = gpc->kvm; kvm_pfn_t old_pfn; void *old_khva; if (gpc->active) { /* * Deactivate the cache before removing it from the list, KVM * must stall mmu_notifier events until all users go away, i.e. * until gpc->lock is dropped and refresh is guaranteed to fail. */ write_lock_irq(&gpc->lock); gpc->active = false; gpc->valid = false; /* * Leave the GPA => uHVA cache intact, it's protected by the * memslot generation. The PFN lookup needs to be redone every * time as mmu_notifier protection is lost when the cache is * removed from the VM's gpc_list. */ old_khva = gpc->khva - offset_in_page(gpc->khva); gpc->khva = NULL; old_pfn = gpc->pfn; gpc->pfn = KVM_PFN_ERR_FAULT; write_unlock_irq(&gpc->lock); spin_lock(&kvm->gpc_lock); list_del(&gpc->list); spin_unlock(&kvm->gpc_lock); gpc_unmap_khva(old_pfn, old_khva); } } EXPORT_SYMBOL_GPL(kvm_gpc_deactivate);
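/*
 * Illustrative usage sketch (not part of this file): the check/refresh
 * pattern expected of gfn_to_pfn_cache users. A reader takes gpc->lock for
 * read, and if kvm_gpc_check() fails it drops the lock, calls
 * kvm_gpc_refresh() and retries before dereferencing gpc->khva. The function
 * name and error handling below are assumptions; in-tree users (e.g. the
 * KVM/Xen shared-info code) follow this shape.
 */
static int example_write_via_gpc(struct gfn_to_pfn_cache *gpc, u32 value)
{
	unsigned long flags;

	read_lock_irqsave(&gpc->lock, flags);
	while (!kvm_gpc_check(gpc, sizeof(value))) {
		read_unlock_irqrestore(&gpc->lock, flags);

		if (kvm_gpc_refresh(gpc, sizeof(value)))
			return -EFAULT;

		read_lock_irqsave(&gpc->lock, flags);
	}

	/* Safe: invalidation takes gpc->lock for write, so it can't race us. */
	*(u32 *)gpc->khva = value;

	read_unlock_irqrestore(&gpc->lock, flags);
	return 0;
}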
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 /* * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de> * * For licencing details see kernel-base/COPYING */ #include <linux/init.h> #include <linux/ioport.h> #include <linux/export.h> #include <linux/pci.h> #include <asm/acpi.h> #include <asm/bios_ebda.h> #include <asm/paravirt.h> #include <asm/pci_x86.h> #include <asm/mpspec.h> #include <asm/setup.h> #include <asm/apic.h> #include <asm/e820/api.h> #include <asm/time.h> #include <asm/irq.h> #include <asm/io_apic.h> #include <asm/hpet.h> #include <asm/memtype.h> #include <asm/tsc.h> #include <asm/iommu.h> #include <asm/mach_traps.h> #include <asm/irqdomain.h> #include <asm/realmode.h> void x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } static int __init iommu_init_noop(void) { return 0; } static void iommu_shutdown_noop(void) { } bool __init bool_x86_init_noop(void) { return false; } void x86_op_int_noop(int cpu) { } int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; } void get_rtc_noop(struct timespec64 *now) { } static __initconst const struct of_device_id of_cmos_match[] = { { .compatible = "motorola,mc146818" }, {} }; /* * Allow devicetree configured systems to disable the RTC by setting the * corresponding DT node's status property to disabled. Code is optimized * out for CONFIG_OF=n builds. */ static __init void x86_wallclock_init(void) { struct device_node *node = of_find_matching_node(NULL, of_cmos_match); if (node && !of_device_is_available(node)) { x86_platform.get_wallclock = get_rtc_noop; x86_platform.set_wallclock = set_rtc_noop; } } /* * The platform setup functions are preset with the default functions * for standard PC hardware. 
*/ struct x86_init_ops x86_init __initdata = { .resources = { .probe_roms = probe_roms, .reserve_resources = reserve_standard_io_resources, .memory_setup = e820__memory_setup_default, }, .mpparse = { .setup_ioapic_ids = x86_init_noop, .find_smp_config = default_find_smp_config, .get_smp_config = default_get_smp_config, }, .irqs = { .pre_vector_init = init_ISA_irqs, .intr_init = native_init_IRQ, .intr_mode_select = apic_intr_mode_select, .intr_mode_init = apic_intr_mode_init, .create_pci_msi_domain = native_create_pci_msi_domain, }, .oem = { .arch_setup = x86_init_noop, .banner = default_banner, }, .paging = { .pagetable_init = native_pagetable_init, }, .timers = { .setup_percpu_clockev = setup_boot_APIC_clock, .timer_init = hpet_time_init, .wallclock_init = x86_wallclock_init, }, .iommu = { .iommu_init = iommu_init_noop, }, .pci = { .init = x86_default_pci_init, .init_irq = x86_default_pci_init_irq, .fixup_irqs = x86_default_pci_fixup_irqs, }, .hyper = { .init_platform = x86_init_noop, .guest_late_init = x86_init_noop, .x2apic_available = bool_x86_init_noop, .msi_ext_dest_id = bool_x86_init_noop, .init_mem_mapping = x86_init_noop, .init_after_bootmem = x86_init_noop, }, .acpi = { .set_root_pointer = x86_default_set_root_pointer, .get_root_pointer = x86_default_get_root_pointer, .reduced_hw_early_init = acpi_generic_reduced_hw_init, }, }; struct x86_cpuinit_ops x86_cpuinit = { .early_percpu_clock_init = x86_init_noop, .setup_percpu_clockev = setup_secondary_APIC_clock, .parallel_bringup = true, }; static void default_nmi_init(void) { }; static bool enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return true; } static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return true; } static bool enc_tlb_flush_required_noop(bool enc) { return false; } static bool enc_cache_flush_required_noop(void) { return false; } static bool is_private_mmio_noop(u64 addr) {return false; } struct x86_platform_ops x86_platform __ro_after_init = { .calibrate_cpu = native_calibrate_cpu_early, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_cmos_time, .iommu_shutdown = iommu_shutdown_noop, .is_untracked_pat_range = is_ISA_range, .nmi_init = default_nmi_init, .get_nmi_reason = default_get_nmi_reason, .save_sched_clock_state = tsc_save_sched_clock_state, .restore_sched_clock_state = tsc_restore_sched_clock_state, .realmode_reserve = reserve_real_mode, .realmode_init = init_real_mode, .hyper.pin_vcpu = x86_op_int_noop, .hyper.is_private_mmio = is_private_mmio_noop, .guest = { .enc_status_change_prepare = enc_status_change_prepare_noop, .enc_status_change_finish = enc_status_change_finish_noop, .enc_tlb_flush_required = enc_tlb_flush_required_noop, .enc_cache_flush_required = enc_cache_flush_required_noop, }, }; EXPORT_SYMBOL_GPL(x86_platform); struct x86_apic_ops x86_apic_ops __ro_after_init = { .io_apic_read = native_io_apic_read, .restore = native_restore_boot_irq_mode, };
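/*
 * Illustrative sketch (not part of this file): platform or hypervisor guest
 * code overrides individual x86_init/x86_platform defaults during early
 * boot, in the same way x86_wallclock_init() above swaps in the RTC noops.
 * The guest name and callbacks here are made up for the example.
 */
static void __init example_guest_init_platform(void)
{
	/* e.g. this guest has no CMOS RTC to write back to */
	x86_platform.set_wallclock = set_rtc_noop;
}

static void __init example_guest_setup(void)
{
	x86_init.hyper.init_platform = example_guest_init_platform;
}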
805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/hugetlb.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "openclose.h" #include "rsrc.h" struct io_rsrc_update { struct file *file; u64 arg; u32 nr_args; u32 offset; }; static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage); /* only define max */ #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) static const struct io_mapped_ubuf dummy_ubuf = { /* set invalid range, so io_import_fixed() fails meeting it */ .ubuf = -1UL, .ubuf_end = 0, }; int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; if (!nr_pages) return 0; /* Don't allow more pages than we can safely lock */ page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; cur_pages = atomic_long_read(&user->locked_vm); do { new_pages = cur_pages + nr_pages; if (new_pages > page_limit) return -ENOMEM; } while (!atomic_long_try_cmpxchg(&user->locked_vm, &cur_pages, new_pages)); return 0; } static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { if (ctx->user) __io_unaccount_mem(ctx->user, nr_pages); if (ctx->mm_account) atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); } static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { int ret; if (ctx->user) { ret = __io_account_mem(ctx->user, nr_pages); if (ret) return ret; } if (ctx->mm_account) atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); return 0; } static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, void __user *arg, unsigned index) { struct iovec __user *src; #ifdef CONFIG_COMPAT if (ctx->compat) { struct compat_iovec __user *ciovs; struct compat_iovec ciov; ciovs = (struct compat_iovec __user *) arg; if 
(copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) return -EFAULT; dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); dst->iov_len = ciov.iov_len; return 0; } #endif src = (struct iovec __user *) arg; if (copy_from_user(dst, &src[index], sizeof(*dst))) return -EFAULT; return 0; } static int io_buffer_validate(struct iovec *iov) { unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); /* * Don't impose further limits on the size and buffer * constraints here, we'll -EINVAL later when IO is * submitted if they are wrong. */ if (!iov->iov_base) return iov->iov_len ? -EFAULT : 0; if (!iov->iov_len) return -EFAULT; /* arbitrary limit, but we need something */ if (iov->iov_len > SZ_1G) return -EFAULT; if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) return -EOVERFLOW; return 0; } static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) { struct io_mapped_ubuf *imu = *slot; unsigned int i; if (imu != &dummy_ubuf) { for (i = 0; i < imu->nr_bvecs; i++) unpin_user_page(imu->bvec[i].bv_page); if (imu->acct_pages) io_unaccount_mem(ctx, imu->acct_pages); kvfree(imu); } *slot = NULL; } static void io_rsrc_put_work(struct io_rsrc_node *node) { struct io_rsrc_put *prsrc = &node->item; if (prsrc->tag) io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0); switch (node->type) { case IORING_RSRC_FILE: fput(prsrc->file); break; case IORING_RSRC_BUFFER: io_rsrc_buf_put(node->ctx, prsrc); break; default: WARN_ON_ONCE(1); break; } } void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache)) kfree(node); } void io_rsrc_node_ref_zero(struct io_rsrc_node *node) __must_hold(&node->ctx->uring_lock) { struct io_ring_ctx *ctx = node->ctx; while (!list_empty(&ctx->rsrc_ref_list)) { node = list_first_entry(&ctx->rsrc_ref_list, struct io_rsrc_node, node); /* recycle ref nodes in order */ if (node->refs) break; list_del(&node->node); if (likely(!node->empty)) io_rsrc_put_work(node); io_rsrc_node_destroy(ctx, node); } if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce)) wake_up_all(&ctx->rsrc_quiesce_wq); } struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { struct io_rsrc_node *ref_node; struct io_cache_entry *entry; entry = io_alloc_cache_get(&ctx->rsrc_node_cache); if (entry) { ref_node = container_of(entry, struct io_rsrc_node, cache); } else { ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); if (!ref_node) return NULL; } ref_node->ctx = ctx; ref_node->empty = 0; ref_node->refs = 1; return ref_node; } __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) { struct io_rsrc_node *backup; DEFINE_WAIT(we); int ret; /* As We may drop ->uring_lock, other task may have started quiesce */ if (data->quiesce) return -ENXIO; backup = io_rsrc_node_alloc(ctx); if (!backup) return -ENOMEM; ctx->rsrc_node->empty = true; ctx->rsrc_node->type = -1; list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list); io_put_rsrc_node(ctx, ctx->rsrc_node); ctx->rsrc_node = backup; if (list_empty(&ctx->rsrc_ref_list)) return 0; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, 1); smp_mb(); } ctx->rsrc_quiesce++; data->quiesce = true; do { prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); mutex_unlock(&ctx->uring_lock); ret = io_run_task_work_sig(ctx); if (ret < 0) { mutex_lock(&ctx->uring_lock); if (list_empty(&ctx->rsrc_ref_list)) ret = 0; break; } schedule(); __set_current_state(TASK_RUNNING); 
mutex_lock(&ctx->uring_lock); ret = 0; } while (!list_empty(&ctx->rsrc_ref_list)); finish_wait(&ctx->rsrc_quiesce_wq, &we); data->quiesce = false; ctx->rsrc_quiesce--; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, 0); smp_mb(); } return ret; } static void io_free_page_table(void **table, size_t size) { unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); for (i = 0; i < nr_tables; i++) kfree(table[i]); kfree(table); } static void io_rsrc_data_free(struct io_rsrc_data *data) { size_t size = data->nr * sizeof(data->tags[0][0]); if (data->tags) io_free_page_table((void **)data->tags, size); kfree(data); } static __cold void **io_alloc_page_table(size_t size) { unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); size_t init_size = size; void **table; table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); if (!table) return NULL; for (i = 0; i < nr_tables; i++) { unsigned int this_size = min_t(size_t, size, PAGE_SIZE); table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); if (!table[i]) { io_free_page_table(table, init_size); return NULL; } size -= this_size; } return table; } __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type, u64 __user *utags, unsigned nr, struct io_rsrc_data **pdata) { struct io_rsrc_data *data; int ret = 0; unsigned i; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); if (!data->tags) { kfree(data); return -ENOMEM; } data->nr = nr; data->ctx = ctx; data->rsrc_type = type; if (utags) { ret = -EFAULT; for (i = 0; i < nr; i++) { u64 *tag_slot = io_get_tag_slot(data, i); if (copy_from_user(tag_slot, &utags[i], sizeof(*tag_slot))) goto fail; } } *pdata = data; return 0; fail: io_rsrc_data_free(data); return ret; } static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned nr_args) { u64 __user *tags = u64_to_user_ptr(up->tags); __s32 __user *fds = u64_to_user_ptr(up->data); struct io_rsrc_data *data = ctx->file_data; struct io_fixed_file *file_slot; int fd, i, err = 0; unsigned int done; if (!ctx->file_data) return -ENXIO; if (up->offset + nr_args > ctx->nr_user_files) return -EINVAL; for (done = 0; done < nr_args; done++) { u64 tag = 0; if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || copy_from_user(&fd, &fds[done], sizeof(fd))) { err = -EFAULT; break; } if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { err = -EINVAL; break; } if (fd == IORING_REGISTER_FILES_SKIP) continue; i = array_index_nospec(up->offset + done, ctx->nr_user_files); file_slot = io_fixed_file_slot(&ctx->file_table, i); if (file_slot->file_ptr) { err = io_queue_rsrc_removal(data, i, io_slot_file(file_slot)); if (err) break; file_slot->file_ptr = 0; io_file_bitmap_clear(&ctx->file_table, i); } if (fd != -1) { struct file *file = fget(fd); if (!file) { err = -EBADF; break; } /* * Don't allow io_uring instances to be registered. */ if (io_is_uring_fops(file)) { fput(file); err = -EBADF; break; } *io_get_tag_slot(data, i) = tag; io_fixed_file_set(file_slot, file); io_file_bitmap_set(&ctx->file_table, i); } } return done ? 
done : err; } static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned int nr_args) { u64 __user *tags = u64_to_user_ptr(up->tags); struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); struct page *last_hpage = NULL; __u32 done; int i, err; if (!ctx->buf_data) return -ENXIO; if (up->offset + nr_args > ctx->nr_user_bufs) return -EINVAL; for (done = 0; done < nr_args; done++) { struct io_mapped_ubuf *imu; u64 tag = 0; err = io_copy_iov(ctx, &iov, iovs, done); if (err) break; if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { err = -EFAULT; break; } err = io_buffer_validate(&iov); if (err) break; if (!iov.iov_base && tag) { err = -EINVAL; break; } err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); if (err) break; i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); if (ctx->user_bufs[i] != &dummy_ubuf) { err = io_queue_rsrc_removal(ctx->buf_data, i, ctx->user_bufs[i]); if (unlikely(err)) { io_buffer_unmap(ctx, &imu); break; } ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf; } ctx->user_bufs[i] = imu; *io_get_tag_slot(ctx->buf_data, i) = tag; } return done ? done : err; } static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, unsigned nr_args) { __u32 tmp; lockdep_assert_held(&ctx->uring_lock); if (check_add_overflow(up->offset, nr_args, &tmp)) return -EOVERFLOW; switch (type) { case IORING_RSRC_FILE: return __io_sqe_files_update(ctx, up, nr_args); case IORING_RSRC_BUFFER: return __io_sqe_buffers_update(ctx, up, nr_args); } return -EINVAL; } int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { struct io_uring_rsrc_update2 up; if (!nr_args) return -EINVAL; memset(&up, 0, sizeof(up)); if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) return -EFAULT; if (up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); } int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type) { struct io_uring_rsrc_update2 up; if (size != sizeof(up)) return -EINVAL; if (copy_from_user(&up, arg, sizeof(up))) return -EFAULT; if (!up.nr || up.resv || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, type, &up, up.nr); } __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type) { struct io_uring_rsrc_register rr; /* keep it extendible */ if (size != sizeof(rr)) return -EINVAL; memset(&rr, 0, sizeof(rr)); if (copy_from_user(&rr, arg, size)) return -EFAULT; if (!rr.nr || rr.resv2) return -EINVAL; if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) return -EINVAL; switch (type) { case IORING_RSRC_FILE: if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) break; return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); case IORING_RSRC_BUFFER: if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) break; return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); } return -EINVAL; } int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; up->offset = READ_ONCE(sqe->off); up->nr_args = READ_ONCE(sqe->len); if (!up->nr_args) return -EINVAL; up->arg = READ_ONCE(sqe->addr); 
return 0; } static int io_files_update_with_index_alloc(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); __s32 __user *fds = u64_to_user_ptr(up->arg); unsigned int done; struct file *file; int ret, fd; if (!req->ctx->file_data) return -ENXIO; for (done = 0; done < up->nr_args; done++) { if (copy_from_user(&fd, &fds[done], sizeof(fd))) { ret = -EFAULT; break; } file = fget(fd); if (!file) { ret = -EBADF; break; } ret = io_fixed_fd_install(req, issue_flags, file, IORING_FILE_INDEX_ALLOC); if (ret < 0) break; if (copy_to_user(&fds[done], &ret, sizeof(ret))) { __io_close_fixed(req->ctx, issue_flags, ret); ret = -EFAULT; break; } } if (done) return done; return ret; } int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update); struct io_ring_ctx *ctx = req->ctx; struct io_uring_rsrc_update2 up2; int ret; up2.offset = up->offset; up2.data = up->arg; up2.nr = 0; up2.tags = 0; up2.resv = 0; up2.resv2 = 0; if (up->offset == IORING_FILE_INDEX_ALLOC) { ret = io_files_update_with_index_alloc(req, issue_flags); } else { io_ring_submit_lock(ctx, issue_flags); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up2, up->nr_args); io_ring_submit_unlock(ctx, issue_flags); } if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; } int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) { struct io_ring_ctx *ctx = data->ctx; struct io_rsrc_node *node = ctx->rsrc_node; u64 *tag_slot = io_get_tag_slot(data, idx); ctx->rsrc_node = io_rsrc_node_alloc(ctx); if (unlikely(!ctx->rsrc_node)) { ctx->rsrc_node = node; return -ENOMEM; } node->item.rsrc = rsrc; node->type = data->rsrc_type; node->item.tag = *tag_slot; *tag_slot = 0; list_add_tail(&node->node, &ctx->rsrc_ref_list); io_put_rsrc_node(ctx, node); return 0; } void __io_sqe_files_unregister(struct io_ring_ctx *ctx) { int i; for (i = 0; i < ctx->nr_user_files; i++) { struct file *file = io_file_from_index(&ctx->file_table, i); if (!file) continue; io_file_bitmap_clear(&ctx->file_table, i); fput(file); } io_free_file_tables(&ctx->file_table); io_file_table_set_alloc_range(ctx, 0, 0); io_rsrc_data_free(ctx->file_data); ctx->file_data = NULL; ctx->nr_user_files = 0; } int io_sqe_files_unregister(struct io_ring_ctx *ctx) { unsigned nr = ctx->nr_user_files; int ret; if (!ctx->file_data) return -ENXIO; /* * Quiesce may unlock ->uring_lock, and while it's not held * prevent new requests using the table. 
*/ ctx->nr_user_files = 0; ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); ctx->nr_user_files = nr; if (!ret) __io_sqe_files_unregister(ctx); return ret; } int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) { __s32 __user *fds = (__s32 __user *) arg; struct file *file; int fd, ret; unsigned i; if (ctx->file_data) return -EBUSY; if (!nr_args) return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; if (nr_args > rlimit(RLIMIT_NOFILE)) return -EMFILE; ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args, &ctx->file_data); if (ret) return ret; if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { io_rsrc_data_free(ctx->file_data); ctx->file_data = NULL; return -ENOMEM; } for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { struct io_fixed_file *file_slot; if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { ret = -EFAULT; goto fail; } /* allow sparse sets */ if (!fds || fd == -1) { ret = -EINVAL; if (unlikely(*io_get_tag_slot(ctx->file_data, i))) goto fail; continue; } file = fget(fd); ret = -EBADF; if (unlikely(!file)) goto fail; /* * Don't allow io_uring instances to be registered. */ if (io_is_uring_fops(file)) { fput(file); goto fail; } file_slot = io_fixed_file_slot(&ctx->file_table, i); io_fixed_file_set(file_slot, file); io_file_bitmap_set(&ctx->file_table, i); } /* default it to the whole table */ io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files); return 0; fail: __io_sqe_files_unregister(ctx); return ret; } static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) { io_buffer_unmap(ctx, &prsrc->buf); prsrc->buf = NULL; } void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) { unsigned int i; for (i = 0; i < ctx->nr_user_bufs; i++) io_buffer_unmap(ctx, &ctx->user_bufs[i]); kfree(ctx->user_bufs); io_rsrc_data_free(ctx->buf_data); ctx->user_bufs = NULL; ctx->buf_data = NULL; ctx->nr_user_bufs = 0; } int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) { unsigned nr = ctx->nr_user_bufs; int ret; if (!ctx->buf_data) return -ENXIO; /* * Quiesce may unlock ->uring_lock, and while it's not held * prevent new requests using the table. */ ctx->nr_user_bufs = 0; ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); ctx->nr_user_bufs = nr; if (!ret) __io_sqe_buffers_unregister(ctx); return ret; } /* * Not super efficient, but this is just a registration time. And we do cache * the last compound head, so generally we'll only do a full search if we don't * match that one. * * We check if the given compound head page has already been accounted, to * avoid double accounting it. This allows us to account the full size of the * page, not just the constituent pages of a huge page. 
*/ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, int nr_pages, struct page *hpage) { int i, j; /* check current page array */ for (i = 0; i < nr_pages; i++) { if (!PageCompound(pages[i])) continue; if (compound_head(pages[i]) == hpage) return true; } /* check previously registered pages */ for (i = 0; i < ctx->nr_user_bufs; i++) { struct io_mapped_ubuf *imu = ctx->user_bufs[i]; for (j = 0; j < imu->nr_bvecs; j++) { if (!PageCompound(imu->bvec[j].bv_page)) continue; if (compound_head(imu->bvec[j].bv_page) == hpage) return true; } } return false; } static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, int nr_pages, struct io_mapped_ubuf *imu, struct page **last_hpage) { int i, ret; imu->acct_pages = 0; for (i = 0; i < nr_pages; i++) { if (!PageCompound(pages[i])) { imu->acct_pages++; } else { struct page *hpage; hpage = compound_head(pages[i]); if (hpage == *last_hpage) continue; *last_hpage = hpage; if (headpage_already_acct(ctx, pages, i, hpage)) continue; imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; } } if (!imu->acct_pages) return 0; ret = io_account_mem(ctx, imu->acct_pages); if (ret) imu->acct_pages = 0; return ret; } struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) { unsigned long start, end, nr_pages; struct page **pages = NULL; int ret; end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = ubuf >> PAGE_SHIFT; nr_pages = end - start; WARN_ON(!nr_pages); pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) return ERR_PTR(-ENOMEM); mmap_read_lock(current->mm); ret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages); mmap_read_unlock(current->mm); /* success, mapped all pages */ if (ret == nr_pages) { *npages = nr_pages; return pages; } /* partial map, or didn't map anything */ if (ret >= 0) { /* if we did partial map, release any pages we did get */ if (ret) unpin_user_pages(pages, ret); ret = -EFAULT; } kvfree(pages); return ERR_PTR(ret); } static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage) { struct io_mapped_ubuf *imu = NULL; struct page **pages = NULL; unsigned long off; size_t size; int ret, nr_pages, i; struct folio *folio = NULL; *pimu = (struct io_mapped_ubuf *)&dummy_ubuf; if (!iov->iov_base) return 0; ret = -ENOMEM; pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, &nr_pages); if (IS_ERR(pages)) { ret = PTR_ERR(pages); pages = NULL; goto done; } /* If it's a huge page, try to coalesce them into a single bvec entry */ if (nr_pages > 1) { folio = page_folio(pages[0]); for (i = 1; i < nr_pages; i++) { /* * Pages must be consecutive and on the same folio for * this to work */ if (page_folio(pages[i]) != folio || pages[i] != pages[i - 1] + 1) { folio = NULL; break; } } if (folio) { /* * The pages are bound to the folio, it doesn't * actually unpin them but drops all but one reference, * which is usually put down by io_buffer_unmap(). * Note, needs a better helper. 
*/ unpin_user_pages(&pages[1], nr_pages - 1); nr_pages = 1; } } imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); if (!imu) goto done; ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); if (ret) { unpin_user_pages(pages, nr_pages); goto done; } off = (unsigned long) iov->iov_base & ~PAGE_MASK; size = iov->iov_len; /* store original address for later verification */ imu->ubuf = (unsigned long) iov->iov_base; imu->ubuf_end = imu->ubuf + iov->iov_len; imu->nr_bvecs = nr_pages; *pimu = imu; ret = 0; if (folio) { bvec_set_page(&imu->bvec[0], pages[0], size, off); goto done; } for (i = 0; i < nr_pages; i++) { size_t vec_len; vec_len = min_t(size_t, size, PAGE_SIZE - off); bvec_set_page(&imu->bvec[i], pages[i], vec_len, off); off = 0; size -= vec_len; } done: if (ret) kvfree(imu); kvfree(pages); return ret; } static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) { ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); return ctx->user_bufs ? 0 : -ENOMEM; } int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int nr_args, u64 __user *tags) { struct page *last_hpage = NULL; struct io_rsrc_data *data; int i, ret; struct iovec iov; BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); if (ctx->user_bufs) return -EBUSY; if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL; ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data); if (ret) return ret; ret = io_buffers_map_alloc(ctx, nr_args); if (ret) { io_rsrc_data_free(data); return ret; } for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { if (arg) { ret = io_copy_iov(ctx, &iov, arg, i); if (ret) break; ret = io_buffer_validate(&iov); if (ret) break; } else { memset(&iov, 0, sizeof(iov)); } if (!iov.iov_base && *io_get_tag_slot(data, i)) { ret = -EINVAL; break; } ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], &last_hpage); if (ret) break; } WARN_ON_ONCE(ctx->buf_data); ctx->buf_data = data; if (ret) __io_sqe_buffers_unregister(ctx); return ret; } int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len) { u64 buf_end; size_t offset; if (WARN_ON_ONCE(!imu)) return -EFAULT; if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) return -EFAULT; /* * Might not be a start of buffer, set size appropriately * and advance us to the beginning. */ offset = buf_addr - imu->ubuf; iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); if (offset) { /* * Don't use iov_iter_advance() here, as it's really slow for * using the latter parts of a big fixed buffer - it iterates * over each segment manually. We can cheat a bit here, because * we know that: * * 1) it's a BVEC iter, we set it up * 2) all bvecs are PAGE_SIZE in size, except potentially the * first and last bvec * * So just find our index, and adjust the iterator afterwards. * If the offset is within the first bvec (or the whole first * bvec, just use iov_iter_advance(). This makes it easier * since we can just skip the first segment, which may not * be PAGE_SIZE aligned. */ const struct bio_vec *bvec = imu->bvec; if (offset < bvec->bv_len) { /* * Note, huge pages buffers consists of one large * bvec entry and should always go this way. The other * branch doesn't expect non PAGE_SIZE'd chunks. 
*/ iter->bvec = bvec; iter->nr_segs = bvec->bv_len; iter->count -= offset; iter->iov_offset = offset; } else { unsigned long seg_skip; /* skip first vec */ offset -= bvec->bv_len; seg_skip = 1 + (offset >> PAGE_SHIFT); iter->bvec = bvec + seg_skip; iter->nr_segs -= seg_skip; iter->count -= bvec->bv_len + offset; iter->iov_offset = offset & ~PAGE_MASK; } } return 0; }
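/*
 * Illustrative userspace sketch (not part of this file): registering a fixed
 * buffer through liburing, which reaches io_sqe_buffers_register() above,
 * then issuing a read that references it by fixed-buffer index. Error paths
 * are trimmed and the buffer size/fd are arbitrary.
 */
#include <liburing.h>
#include <stdlib.h>

static int example_fixed_read(int fd)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct iovec iov = { .iov_base = malloc(4096), .iov_len = 4096 };
	int ret;

	ret = io_uring_queue_init(8, &ring, 0);
	if (ret)
		return ret;

	/* IORING_REGISTER_BUFFERS: kernel pins the pages and builds the bvec table */
	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret)
		goto out;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read_fixed(sqe, fd, iov.iov_base, iov.iov_len, 0, 0);
	ret = io_uring_submit(&ring);
out:
	io_uring_queue_exit(&ring);
	free(iov.iov_base);
	return ret < 0 ? ret : 0;
}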
13 23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 #ifndef __DRM_GEM_H__ #define __DRM_GEM_H__ /* * GEM Graphics Execution Manager Driver Interfaces * * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * Copyright (c) 2009-2010, Code Aurora Forum. * All rights reserved. * Copyright © 2014 Intel Corporation * Daniel Vetter <daniel.vetter@ffwll.ch> * * Author: Rickard E. (Rik) Faith <faith@valinux.com> * Author: Gareth Hughes <gareth@valinux.com> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include <linux/kref.h> #include <linux/dma-resv.h> #include <linux/list.h> #include <linux/mutex.h> #include <drm/drm_vma_manager.h> struct iosys_map; struct drm_gem_object; /** * enum drm_gem_object_status - bitmask of object state for fdinfo reporting * @DRM_GEM_OBJECT_RESIDENT: object is resident in memory (ie. not unpinned) * @DRM_GEM_OBJECT_PURGEABLE: object marked as purgeable by userspace * * Bitmask of status used for fdinfo memory stats, see &drm_gem_object_funcs.status * and drm_show_fdinfo(). Note that an object can DRM_GEM_OBJECT_PURGEABLE if * it still active or not resident, in which case drm_show_fdinfo() will not * account for it as purgeable. So drivers do not need to check if the buffer * is idle and resident to return this bit. (Ie. userspace can mark a buffer * as purgeable even while it is still busy on the GPU.. it does not _actually_ * become puregeable until it becomes idle. The status gem object func does * not need to consider this.) */ enum drm_gem_object_status { DRM_GEM_OBJECT_RESIDENT = BIT(0), DRM_GEM_OBJECT_PURGEABLE = BIT(1), }; /** * struct drm_gem_object_funcs - GEM object functions */ struct drm_gem_object_funcs { /** * @free: * * Deconstructor for drm_gem_objects. * * This callback is mandatory. */ void (*free)(struct drm_gem_object *obj); /** * @open: * * Called upon GEM handle creation. * * This callback is optional. */ int (*open)(struct drm_gem_object *obj, struct drm_file *file); /** * @close: * * Called upon GEM handle release. * * This callback is optional. */ void (*close)(struct drm_gem_object *obj, struct drm_file *file); /** * @print_info: * * If driver subclasses struct &drm_gem_object, it can implement this * optional hook for printing additional driver specific info. * * drm_printf_indent() should be used in the callback passing it the * indent argument. * * This callback is called from drm_gem_print_info(). * * This callback is optional. */ void (*print_info)(struct drm_printer *p, unsigned int indent, const struct drm_gem_object *obj); /** * @export: * * Export backing buffer as a &dma_buf. * If this is not set drm_gem_prime_export() is used. * * This callback is optional. */ struct dma_buf *(*export)(struct drm_gem_object *obj, int flags); /** * @pin: * * Pin backing buffer in memory. Used by the drm_gem_map_attach() helper. * * This callback is optional. */ int (*pin)(struct drm_gem_object *obj); /** * @unpin: * * Unpin backing buffer. Used by the drm_gem_map_detach() helper. * * This callback is optional. */ void (*unpin)(struct drm_gem_object *obj); /** * @get_sg_table: * * Returns a Scatter-Gather table representation of the buffer. * Used when exporting a buffer by the drm_gem_map_dma_buf() helper. * Releasing is done by calling dma_unmap_sg_attrs() and sg_free_table() * in drm_gem_unmap_buf(), therefore these helpers and this callback * here cannot be used for sg tables pointing at driver private memory * ranges. * * See also drm_prime_pages_to_sg(). 
*/ struct sg_table *(*get_sg_table)(struct drm_gem_object *obj); /** * @vmap: * * Returns a virtual address for the buffer. Used by the * drm_gem_dmabuf_vmap() helper. * * This callback is optional. */ int (*vmap)(struct drm_gem_object *obj, struct iosys_map *map); /** * @vunmap: * * Releases the address previously returned by @vmap. Used by the * drm_gem_dmabuf_vunmap() helper. * * This callback is optional. */ void (*vunmap)(struct drm_gem_object *obj, struct iosys_map *map); /** * @mmap: * * Handle mmap() of the gem object, setup vma accordingly. * * This callback is optional. * * The callback is used by both drm_gem_mmap_obj() and * drm_gem_prime_mmap(). When @mmap is present @vm_ops is not * used, the @mmap callback must set vma->vm_ops instead. */ int (*mmap)(struct drm_gem_object *obj, struct vm_area_struct *vma); /** * @evict: * * Evicts gem object out from memory. Used by the drm_gem_object_evict() * helper. Returns 0 on success, -errno otherwise. * * This callback is optional. */ int (*evict)(struct drm_gem_object *obj); /** * @status: * * The optional status callback can return additional object state * which determines which stats the object is counted against. The * callback is called under table_lock. Racing against object status * change is "harmless", and the callback can expect to not race * against object destruction. * * Called by drm_show_memory_stats(). */ enum drm_gem_object_status (*status)(struct drm_gem_object *obj); /** * @rss: * * Return resident size of the object in physical memory. * * Called by drm_show_memory_stats(). */ size_t (*rss)(struct drm_gem_object *obj); /** * @vm_ops: * * Virtual memory operations used with mmap. * * This is optional but necessary for mmap support. */ const struct vm_operations_struct *vm_ops; }; /** * struct drm_gem_lru - A simple LRU helper * * A helper for tracking GEM objects in a given state, to aid in * driver's shrinker implementation. Tracks the count of pages * for lockless &shrinker.count_objects, and provides * &drm_gem_lru_scan for driver's &shrinker.scan_objects * implementation. */ struct drm_gem_lru { /** * @lock: * * Lock protecting movement of GEM objects between LRUs. All * LRUs that the object can move between should be protected * by the same lock. */ struct mutex *lock; /** * @count: * * The total number of backing pages of the GEM objects in * this LRU. */ long count; /** * @list: * * The LRU list. */ struct list_head list; }; /** * struct drm_gem_object - GEM buffer object * * This structure defines the generic parts for GEM buffer objects, which are * mostly around handling mmap and userspace handles. * * Buffer objects are often abbreviated to BO. */ struct drm_gem_object { /** * @refcount: * * Reference count of this object * * Please use drm_gem_object_get() to acquire and drm_gem_object_put_locked() * or drm_gem_object_put() to release a reference to a GEM * buffer object. */ struct kref refcount; /** * @handle_count: * * This is the GEM file_priv handle count of this object. * * Each handle also holds a reference. Note that when the handle_count * drops to 0 any global names (e.g. the id in the flink namespace) will * be cleared. * * Protected by &drm_device.object_name_lock. */ unsigned handle_count; /** * @dev: DRM dev this object belongs to. */ struct drm_device *dev; /** * @filp: * * SHMEM file node used as backing storage for swappable buffer objects. * GEM also supports driver private objects with driver-specific backing * storage (contiguous DMA memory, special reserved blocks). 
In this * case @filp is NULL. */ struct file *filp; /** * @vma_node: * * Mapping info for this object to support mmap. Drivers are supposed to * allocate the mmap offset using drm_gem_create_mmap_offset(). The * offset itself can be retrieved using drm_vma_node_offset_addr(). * * Memory mapping itself is handled by drm_gem_mmap(), which also checks * that userspace is allowed to access the object. */ struct drm_vma_offset_node vma_node; /** * @size: * * Size of the object, in bytes. Immutable over the object's * lifetime. */ size_t size; /** * @name: * * Global name for this object, starts at 1. 0 means unnamed. * Access is covered by &drm_device.object_name_lock. This is used by * the GEM_FLINK and GEM_OPEN ioctls. */ int name; /** * @dma_buf: * * dma-buf associated with this GEM object. * * Pointer to the dma-buf associated with this gem object (either * through importing or exporting). We break the resulting reference * loop when the last gem handle for this object is released. * * Protected by &drm_device.object_name_lock. */ struct dma_buf *dma_buf; /** * @import_attach: * * dma-buf attachment backing this object. * * Any foreign dma_buf imported as a gem object has this set to the * attachment point for the device. This is invariant over the lifetime * of a gem object. * * The &drm_gem_object_funcs.free callback is responsible for * cleaning up the dma_buf attachment and references acquired at import * time. * * Note that the drm gem/prime core does not depend upon drivers setting * this field any more. So for drivers where this doesn't make sense * (e.g. virtual devices or a displaylink behind an usb bus) they can * simply leave it as NULL. */ struct dma_buf_attachment *import_attach; /** * @resv: * * Pointer to reservation object associated with the this GEM object. * * Normally (@resv == &@_resv) except for imported GEM objects. */ struct dma_resv *resv; /** * @_resv: * * A reservation object for this GEM object. * * This is unused for imported GEM objects. */ struct dma_resv _resv; /** * @gpuva: * * Provides the list of GPU VAs attached to this GEM object. * * Drivers should lock list accesses with the GEMs &dma_resv lock * (&drm_gem_object.resv) or a custom lock if one is provided. */ struct { struct list_head list; #ifdef CONFIG_LOCKDEP struct lockdep_map *lock_dep_map; #endif } gpuva; /** * @funcs: * * Optional GEM object functions. If this is set, it will be used instead of the * corresponding &drm_driver GEM callbacks. * * New drivers should use this. * */ const struct drm_gem_object_funcs *funcs; /** * @lru_node: * * List node in a &drm_gem_lru. */ struct list_head lru_node; /** * @lru: * * The current LRU list that the GEM object is on. */ struct drm_gem_lru *lru; }; /** * DRM_GEM_FOPS - Default drm GEM file operations * * This macro provides a shorthand for setting the GEM file ops in the * &file_operations structure. If all you need are the default ops, use * DEFINE_DRM_GEM_FOPS instead. */ #define DRM_GEM_FOPS \ .open = drm_open,\ .release = drm_release,\ .unlocked_ioctl = drm_ioctl,\ .compat_ioctl = drm_compat_ioctl,\ .poll = drm_poll,\ .read = drm_read,\ .llseek = noop_llseek,\ .mmap = drm_gem_mmap /** * DEFINE_DRM_GEM_FOPS() - macro to generate file operations for GEM drivers * @name: name for the generated structure * * This macro autogenerates a suitable &struct file_operations for GEM based * drivers, which can be assigned to &drm_driver.fops. 
Note that this structure * cannot be shared between drivers, because it contains a reference to the * current module using THIS_MODULE. * * Note that the declaration is already marked as static - if you need a * non-static version of this you're probably doing it wrong and will break the * THIS_MODULE reference by accident. */ #define DEFINE_DRM_GEM_FOPS(name) \ static const struct file_operations name = {\ .owner = THIS_MODULE,\ DRM_GEM_FOPS,\ } void drm_gem_object_release(struct drm_gem_object *obj); void drm_gem_object_free(struct kref *kref); int drm_gem_object_init(struct drm_device *dev, struct drm_gem_object *obj, size_t size); void drm_gem_private_object_init(struct drm_device *dev, struct drm_gem_object *obj, size_t size); void drm_gem_private_object_fini(struct drm_gem_object *obj); void drm_gem_vm_open(struct vm_area_struct *vma); void drm_gem_vm_close(struct vm_area_struct *vma); int drm_gem_mmap_obj(struct drm_gem_object *obj, unsigned long obj_size, struct vm_area_struct *vma); int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma); /** * drm_gem_object_get - acquire a GEM buffer object reference * @obj: GEM buffer object * * This function acquires an additional reference to @obj. It is illegal to * call this without already holding a reference. No locks required. */ static inline void drm_gem_object_get(struct drm_gem_object *obj) { kref_get(&obj->refcount); } __attribute__((nonnull)) static inline void __drm_gem_object_put(struct drm_gem_object *obj) { kref_put(&obj->refcount, drm_gem_object_free); } /** * drm_gem_object_put - drop a GEM buffer object reference * @obj: GEM buffer object * * This releases a reference to @obj. */ static inline void drm_gem_object_put(struct drm_gem_object *obj) { if (obj) __drm_gem_object_put(obj); } int drm_gem_handle_create(struct drm_file *file_priv, struct drm_gem_object *obj, u32 *handlep); int drm_gem_handle_delete(struct drm_file *filp, u32 handle); void drm_gem_free_mmap_offset(struct drm_gem_object *obj); int drm_gem_create_mmap_offset(struct drm_gem_object *obj); int drm_gem_create_mmap_offset_size(struct drm_gem_object *obj, size_t size); struct page **drm_gem_get_pages(struct drm_gem_object *obj); void drm_gem_put_pages(struct drm_gem_object *obj, struct page **pages, bool dirty, bool accessed); int drm_gem_vmap_unlocked(struct drm_gem_object *obj, struct iosys_map *map); void drm_gem_vunmap_unlocked(struct drm_gem_object *obj, struct iosys_map *map); int drm_gem_objects_lookup(struct drm_file *filp, void __user *bo_handles, int count, struct drm_gem_object ***objs_out); struct drm_gem_object *drm_gem_object_lookup(struct drm_file *filp, u32 handle); long drm_gem_dma_resv_wait(struct drm_file *filep, u32 handle, bool wait_all, unsigned long timeout); int drm_gem_lock_reservations(struct drm_gem_object **objs, int count, struct ww_acquire_ctx *acquire_ctx); void drm_gem_unlock_reservations(struct drm_gem_object **objs, int count, struct ww_acquire_ctx *acquire_ctx); int drm_gem_dumb_map_offset(struct drm_file *file, struct drm_device *dev, u32 handle, u64 *offset); void drm_gem_lru_init(struct drm_gem_lru *lru, struct mutex *lock); void drm_gem_lru_remove(struct drm_gem_object *obj); void drm_gem_lru_move_tail_locked(struct drm_gem_lru *lru, struct drm_gem_object *obj); void drm_gem_lru_move_tail(struct drm_gem_lru *lru, struct drm_gem_object *obj); unsigned long drm_gem_lru_scan(struct drm_gem_lru *lru, unsigned int nr_to_scan, unsigned long *remaining, bool (*shrink)(struct drm_gem_object *obj)); int drm_gem_evict(struct 
			  drm_gem_object *obj);

#ifdef CONFIG_LOCKDEP
/**
 * drm_gem_gpuva_set_lock() - Set the lock protecting accesses to the gpuva list.
 * @obj: the &drm_gem_object
 * @lock: the lock used to protect the gpuva list. The locking primitive
 * must contain a dep_map field.
 *
 * Call this if you're not protecting access to the gpuva list with the
 * dma-resv lock, but with a custom lock.
 */
#define drm_gem_gpuva_set_lock(obj, lock) \
	if (!WARN((obj)->gpuva.lock_dep_map, \
		  "GEM GPUVA lock should be set only once.")) \
		(obj)->gpuva.lock_dep_map = &(lock)->dep_map
#define drm_gem_gpuva_assert_lock_held(obj) \
	lockdep_assert((obj)->gpuva.lock_dep_map ? \
		       lock_is_held((obj)->gpuva.lock_dep_map) : \
		       dma_resv_held((obj)->resv))
#else
#define drm_gem_gpuva_set_lock(obj, lock) do {} while (0)
#define drm_gem_gpuva_assert_lock_held(obj) do {} while (0)
#endif

/**
 * drm_gem_gpuva_init() - initialize the gpuva list of a GEM object
 * @obj: the &drm_gem_object
 *
 * This initializes the &drm_gem_object's &drm_gpuvm_bo list.
 *
 * Calling this function is only necessary for drivers intending to support the
 * &drm_driver_feature DRIVER_GEM_GPUVA.
 *
 * See also drm_gem_gpuva_set_lock().
 */
static inline void drm_gem_gpuva_init(struct drm_gem_object *obj)
{
	INIT_LIST_HEAD(&obj->gpuva.list);
}

/**
 * drm_gem_for_each_gpuvm_bo() - iterator to walk over a list of &drm_gpuvm_bo
 * @entry__: &drm_gpuvm_bo structure to assign to in each iteration step
 * @obj__: the &drm_gem_object the &drm_gpuvm_bo to walk are associated with
 *
 * This iterator walks over all &drm_gpuvm_bo structures associated with the
 * &drm_gem_object.
 */
#define drm_gem_for_each_gpuvm_bo(entry__, obj__) \
	list_for_each_entry(entry__, &(obj__)->gpuva.list, list.entry.gem)

/**
 * drm_gem_for_each_gpuvm_bo_safe() - iterator to safely walk over a list of
 * &drm_gpuvm_bo
 * @entry__: &drm_gpuvm_bo structure to assign to in each iteration step
 * @next__: &drm_gpuvm_bo to store the next step
 * @obj__: the &drm_gem_object the &drm_gpuvm_bo to walk are associated with
 *
 * This iterator walks over all &drm_gpuvm_bo structures associated with the
 * &drm_gem_object. It is implemented with list_for_each_entry_safe(), hence
 * it is safe against removal of elements.
 */
#define drm_gem_for_each_gpuvm_bo_safe(entry__, next__, obj__) \
	list_for_each_entry_safe(entry__, next__, &(obj__)->gpuva.list, list.entry.gem)

#endif /* __DRM_GEM_H__ */
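A usage sketch, not part of the header above, showing how a driver typically combines DEFINE_DRM_GEM_FOPS() with the handle-lookup and reference helpers declared there: look the handle up (which takes a reference), use the object, then drop the reference with drm_gem_object_put(). The function name my_dump_bo_size() and its pr_info() output are purely illustrative.

#include <linux/module.h>
#include <linux/printk.h>
#include <drm/drm_drv.h>
#include <drm/drm_file.h>
#include <drm/drm_gem.h>

/* Expands to a static struct file_operations pre-wired for GEM drivers. */
DEFINE_DRM_GEM_FOPS(my_driver_gem_fops);

static int my_dump_bo_size(struct drm_file *file, u32 handle)
{
	struct drm_gem_object *obj;

	/* drm_gem_object_lookup() returns a new reference, or NULL. */
	obj = drm_gem_object_lookup(file, handle);
	if (!obj)
		return -ENOENT;

	pr_info("BO %u is %zu bytes\n", handle, obj->size);

	/* Drop the reference taken by the lookup. */
	drm_gem_object_put(obj);
	return 0;
}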
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) Gao Xiang <xiang@kernel.org>
 *
 * For low-latency decompression algorithms (e.g. lz4), reserve consecutive
 * per-CPU virtual memory (in pages) in advance to store such inplace I/O
 * data if inplace decompression fails (due to an unmet inplace margin, for
 * example).
 */
#include "internal.h"

struct erofs_pcpubuf {
	raw_spinlock_t lock;
	void *ptr;
	struct page **pages;
	unsigned int nrpages;
};

static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb);

void *erofs_get_pcpubuf(unsigned int requiredpages)
	__acquires(pcb->lock)
{
	struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb);

	raw_spin_lock(&pcb->lock);
	/* check if the per-CPU buffer is too small */
	if (requiredpages > pcb->nrpages) {
		raw_spin_unlock(&pcb->lock);
		put_cpu_var(erofs_pcb);
		/* (for sparse checker) pretend pcb->lock is still taken */
		__acquire(pcb->lock);
		return NULL;
	}
	return pcb->ptr;
}

void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock)
{
	struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id());

	DBG_BUGON(pcb->ptr != ptr);
	raw_spin_unlock(&pcb->lock);
	put_cpu_var(erofs_pcb);
}

/* the next step: support per-CPU page buffers hotplug */
int erofs_pcpubuf_growsize(unsigned int nrpages)
{
	static DEFINE_MUTEX(pcb_resize_mutex);
	static unsigned int pcb_nrpages;
	struct page *pagepool = NULL;
	int delta, cpu, ret, i;

	mutex_lock(&pcb_resize_mutex);
	delta = nrpages - pcb_nrpages;
	ret = 0;
	/* avoid shrinking pcpubuf, since we have no idea how many fses rely on it */
	if (delta <= 0)
		goto out;

	for_each_possible_cpu(cpu) {
		struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
		struct page **pages, **oldpages;
		void *ptr, *old_ptr;

		pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL);
		if (!pages) {
			ret = -ENOMEM;
			break;
		}

		for (i = 0; i < nrpages; ++i) {
			pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL);
			if (!pages[i]) {
				ret = -ENOMEM;
				oldpages = pages;
				goto free_pagearray;
			}
		}
		ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL);
		if (!ptr) {
			ret = -ENOMEM;
			oldpages = pages;
			goto free_pagearray;
		}
		raw_spin_lock(&pcb->lock);
		old_ptr = pcb->ptr;
		pcb->ptr = ptr;
		oldpages = pcb->pages;
		pcb->pages = pages;
		i = pcb->nrpages;
		pcb->nrpages = nrpages;
		raw_spin_unlock(&pcb->lock);

		if (!oldpages) {
			DBG_BUGON(old_ptr);
			continue;
		}

		if (old_ptr)
			vunmap(old_ptr);
free_pagearray:
		while (i)
			erofs_pagepool_add(&pagepool, oldpages[--i]);
		kfree(oldpages);
		if (ret)
			break;
	}
	pcb_nrpages = nrpages;
	erofs_release_pages(&pagepool);
out:
	mutex_unlock(&pcb_resize_mutex);
	return ret;
}

void __init erofs_pcpubuf_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);

		raw_spin_lock_init(&pcb->lock);
	}
}

void erofs_pcpubuf_exit(void)
{
	int cpu, i;

	for_each_possible_cpu(cpu) {
		struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);

		if (pcb->ptr) {
			vunmap(pcb->ptr);
			pcb->ptr = NULL;
		}
		if (!pcb->pages)
			continue;

		for (i = 0; i < pcb->nrpages; ++i)
			if (pcb->pages[i])
				put_page(pcb->pages[i]);
		kfree(pcb->pages);
		pcb->pages = NULL;
	}
}
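A usage sketch, not part of the file above, of the expected call pattern for the per-CPU buffer helpers: erofs_get_pcpubuf() returns with preemption disabled and the per-CPU spinlock held, so the buffer may only be used for short, non-sleeping copies before erofs_put_pcpubuf() releases it. The caller name, the -ENOSPC fallback, and the assumption that len fits in nrpages * PAGE_SIZE are illustrative; the sketch is written as if it lived in the same file, next to the declarations above.

static int example_copy_via_pcpubuf(const void *src, unsigned int nrpages,
				    size_t len)
{
	void *dst;

	dst = erofs_get_pcpubuf(nrpages);	/* locks the per-CPU buffer */
	if (!dst)				/* buffer smaller than nrpages */
		return -ENOSPC;

	/* Must not sleep while the per-CPU buffer is held. */
	memcpy(dst, src, len);
	/* ... decompress or otherwise consume dst here ... */

	erofs_put_pcpubuf(dst);			/* unlock, re-enable preemption */
	return 0;
}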
// SPDX-License-Identifier: GPL-2.0+
/*
 * IMA support for appraising module-style appended signatures.
 *
 * Copyright (C) 2019 IBM Corporation
 *
 * Author:
 * Thiago Jung Bauermann <bauerman@linux.ibm.com>
 */

#include <linux/types.h>
#include <linux/module_signature.h>
#include <keys/asymmetric-type.h>
#include <crypto/pkcs7.h>

#include "ima.h"

struct modsig {
	struct pkcs7_message *pkcs7_msg;

	enum hash_algo hash_algo;

	/* This digest will go in the 'd-modsig' field of the IMA template. */
	const u8 *digest;
	u32 digest_size;

	/*
	 * This is what will go to the measurement list if the template requires
	 * storing the signature.
	 */
	int raw_pkcs7_len;
	u8 raw_pkcs7[] __counted_by(raw_pkcs7_len);
};

/*
 * ima_read_modsig - Read modsig from buf.
 *
 * Return: 0 on success, error code otherwise.
 */
int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len,
		    struct modsig **modsig)
{
	const size_t marker_len = strlen(MODULE_SIG_STRING);
	const struct module_signature *sig;
	struct modsig *hdr;
	size_t sig_len;
	const void *p;
	int rc;

	if (buf_len <= marker_len + sizeof(*sig))
		return -ENOENT;

	p = buf + buf_len - marker_len;
	if (memcmp(p, MODULE_SIG_STRING, marker_len))
		return -ENOENT;

	buf_len -= marker_len;
	sig = (const struct module_signature *)(p - sizeof(*sig));

	rc = mod_check_sig(sig, buf_len, func_tokens[func]);
	if (rc)
		return rc;

	sig_len = be32_to_cpu(sig->sig_len);
	buf_len -= sig_len + sizeof(*sig);

	/* Allocate sig_len additional bytes to hold the raw PKCS#7 data. */
	hdr = kzalloc(struct_size(hdr, raw_pkcs7, sig_len), GFP_KERNEL);
	if (!hdr)
		return -ENOMEM;
	hdr->raw_pkcs7_len = sig_len;

	hdr->pkcs7_msg = pkcs7_parse_message(buf + buf_len, sig_len);
	if (IS_ERR(hdr->pkcs7_msg)) {
		rc = PTR_ERR(hdr->pkcs7_msg);
		kfree(hdr);
		return rc;
	}

	memcpy(hdr->raw_pkcs7, buf + buf_len, sig_len);

	/* We don't know the hash algorithm yet. */
	hdr->hash_algo = HASH_ALGO__LAST;

	*modsig = hdr;

	return 0;
}

/**
 * ima_collect_modsig - Calculate the file hash without the appended signature.
 * @modsig: parsed module signature
 * @buf: data to verify the signature on
 * @size: data size
 *
 * Since the modsig is part of the file contents, the hash used in its signature
 * isn't the same one ordinarily calculated by IMA. Therefore PKCS7 code
 * calculates a separate one for signature verification.
 */
void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size)
{
	int rc;

	/*
	 * Provide the file contents (minus the appended sig) so that the PKCS7
	 * code can calculate the file hash.
	 */
	size -= modsig->raw_pkcs7_len + strlen(MODULE_SIG_STRING) +
		sizeof(struct module_signature);
	rc = pkcs7_supply_detached_data(modsig->pkcs7_msg, buf, size);
	if (rc)
		return;

	/* Ask the PKCS7 code to calculate the file hash. */
	rc = pkcs7_get_digest(modsig->pkcs7_msg, &modsig->digest,
			      &modsig->digest_size, &modsig->hash_algo);
}

int ima_modsig_verify(struct key *keyring, const struct modsig *modsig)
{
	return verify_pkcs7_message_sig(NULL, 0, modsig->pkcs7_msg, keyring,
					VERIFYING_MODULE_SIGNATURE, NULL, NULL);
}

int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo,
			  const u8 **digest, u32 *digest_size)
{
	*algo = modsig->hash_algo;
	*digest = modsig->digest;
	*digest_size = modsig->digest_size;

	return 0;
}

int ima_get_raw_modsig(const struct modsig *modsig, const void **data,
		       u32 *data_len)
{
	*data = &modsig->raw_pkcs7;
	*data_len = modsig->raw_pkcs7_len;

	return 0;
}

void ima_free_modsig(struct modsig *modsig)
{
	if (!modsig)
		return;

	pkcs7_free_message(modsig->pkcs7_msg);
	kfree(modsig);
}
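A sketch, not part of the file above, of the order in which these helpers are meant to be called: parse the appended signature trailer, let the PKCS#7 code hash the file body without the trailer, verify against a keyring, then free the parsed state. The caller name, buffer arguments, and keyring are placeholders; the sketch assumes it sits alongside code that already includes "ima.h".

static int example_appraise_appended_sig(enum ima_hooks func,
					 struct key *keyring,
					 const void *buf, loff_t buf_len)
{
	struct modsig *modsig;
	int rc;

	rc = ima_read_modsig(func, buf, buf_len, &modsig);	/* parse trailer */
	if (rc)
		return rc;	/* e.g. -ENOENT: no appended signature found */

	/* Hash everything except the appended signature itself. */
	ima_collect_modsig(modsig, buf, buf_len);

	rc = ima_modsig_verify(keyring, modsig);	/* check the PKCS#7 sig */

	ima_free_modsig(modsig);
	return rc;
}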
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions work with the state functions in sctp_sm_statefuns.c
 * to implement the state operations.  These functions implement the
 * steps which require modifying existing data structures.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson <karl@athena.chicago.il.us>
 *    Jon Grimm <jgrimm@austin.ibm.com>
 *    Hui Huang <hui.huang@nokia.com>
 *    Dajiang Zhang <dajiang.zhang@nokia.com>
 *    Daisy Chang <daisyc@us.ibm.com>
 *    Sridhar Samudrala <sri@us.ibm.com>
 *    Ardelle Fan <ardelle.fan@intel.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/ip.h>
#include <linux/gfp.h>
#include <net/sock.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
#include <net/sctp/stream_sched.h>

static int sctp_cmd_interpreter(enum sctp_event_type event_type,
				union sctp_subtype subtype,
				enum sctp_state state,
				struct sctp_endpoint *ep,
				struct sctp_association *asoc,
				void *event_arg,
				enum sctp_disposition status,
				struct sctp_cmd_seq *commands,
				gfp_t gfp);
static int sctp_side_effects(enum sctp_event_type event_type,
			     union sctp_subtype subtype,
			     enum sctp_state state,
			     struct sctp_endpoint *ep,
			     struct sctp_association **asoc,
			     void *event_arg,
			     enum sctp_disposition status,
			     struct sctp_cmd_seq *commands,
			     gfp_t gfp);

/********************************************************************
 * Helper functions
 ********************************************************************/

/* A helper function for delayed processing of INET ECN CE bit. */
static void sctp_do_ecn_ce_work(struct sctp_association *asoc,
				__u32 lowest_tsn)
{
	/* Save the TSN away for comparison when we receive CWR */
	asoc->last_ecne_tsn = lowest_tsn;
	asoc->need_ecne = 1;
}

/* Helper function for delayed processing of SCTP ECNE chunk.
*/ /* RFC 2960 Appendix A * * RFC 2481 details a specific bit for a sender to send in * the header of its next outbound TCP segment to indicate to * its peer that it has reduced its congestion window. This * is termed the CWR bit. For SCTP the same indication is made * by including the CWR chunk. This chunk contains one data * element, i.e. the TSN number that was sent in the ECNE chunk. * This element represents the lowest TSN number in the datagram * that was originally marked with the CE bit. */ static struct sctp_chunk *sctp_do_ecn_ecne_work(struct sctp_association *asoc, __u32 lowest_tsn, struct sctp_chunk *chunk) { struct sctp_chunk *repl; /* Our previously transmitted packet ran into some congestion * so we should take action by reducing cwnd and ssthresh * and then ACK our peer that we we've done so by * sending a CWR. */ /* First, try to determine if we want to actually lower * our cwnd variables. Only lower them if the ECNE looks more * recent than the last response. */ if (TSN_lt(asoc->last_cwr_tsn, lowest_tsn)) { struct sctp_transport *transport; /* Find which transport's congestion variables * need to be adjusted. */ transport = sctp_assoc_lookup_tsn(asoc, lowest_tsn); /* Update the congestion variables. */ if (transport) sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_ECNE); asoc->last_cwr_tsn = lowest_tsn; } /* Always try to quiet the other end. In case of lost CWR, * resend last_cwr_tsn. */ repl = sctp_make_cwr(asoc, asoc->last_cwr_tsn, chunk); /* If we run out of memory, it will look like a lost CWR. We'll * get back in sync eventually. */ return repl; } /* Helper function to do delayed processing of ECN CWR chunk. */ static void sctp_do_ecn_cwr_work(struct sctp_association *asoc, __u32 lowest_tsn) { /* Turn off ECNE getting auto-prepended to every outgoing * packet */ asoc->need_ecne = 0; } /* Generate SACK if necessary. We call this at the end of a packet. */ static int sctp_gen_sack(struct sctp_association *asoc, int force, struct sctp_cmd_seq *commands) { struct sctp_transport *trans = asoc->peer.last_data_from; __u32 ctsn, max_tsn_seen; struct sctp_chunk *sack; int error = 0; if (force || (!trans && (asoc->param_flags & SPP_SACKDELAY_DISABLE)) || (trans && (trans->param_flags & SPP_SACKDELAY_DISABLE))) asoc->peer.sack_needed = 1; ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map); /* From 12.2 Parameters necessary per association (i.e. the TCB): * * Ack State : This flag indicates if the next received packet * : is to be responded to with a SACK. ... * : When DATA chunks are out of order, SACK's * : are not delayed (see Section 6). * * [This is actually not mentioned in Section 6, but we * implement it here anyway. --piggy] */ if (max_tsn_seen != ctsn) asoc->peer.sack_needed = 1; /* From 6.2 Acknowledgement on Reception of DATA Chunks: * * Section 4.2 of [RFC2581] SHOULD be followed. Specifically, * an acknowledgement SHOULD be generated for at least every * second packet (not every second DATA chunk) received, and * SHOULD be generated within 200 ms of the arrival of any * unacknowledged DATA chunk. ... */ if (!asoc->peer.sack_needed) { asoc->peer.sack_cnt++; /* Set the SACK delay timeout based on the * SACK delay for the last transport * data was received from, or the default * for the association. */ if (trans) { /* We will need a SACK for the next packet. 
*/ if (asoc->peer.sack_cnt >= trans->sackfreq - 1) asoc->peer.sack_needed = 1; asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = trans->sackdelay; } else { /* We will need a SACK for the next packet. */ if (asoc->peer.sack_cnt >= asoc->sackfreq - 1) asoc->peer.sack_needed = 1; asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; } /* Restart the SACK timer. */ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); } else { __u32 old_a_rwnd = asoc->a_rwnd; asoc->a_rwnd = asoc->rwnd; sack = sctp_make_sack(asoc); if (!sack) { asoc->a_rwnd = old_a_rwnd; goto nomem; } asoc->peer.sack_needed = 0; asoc->peer.sack_cnt = 0; sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(sack)); /* Stop the SACK timer. */ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); } return error; nomem: error = -ENOMEM; return error; } /* When the T3-RTX timer expires, it calls this function to create the * relevant state machine event. */ void sctp_generate_t3_rtx_event(struct timer_list *t) { struct sctp_transport *transport = from_timer(transport, t, T3_rtx_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); int error; /* Check whether a task is in the sock. */ bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ if (!mod_timer(&transport->T3_rtx_timer, jiffies + (HZ/20))) sctp_transport_hold(transport); goto out_unlock; } /* Run through the state machine. */ error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_T3_RTX), asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); if (error) sk->sk_err = -error; out_unlock: bh_unlock_sock(sk); sctp_transport_put(transport); } /* This is a sa interface for producing timeout events. It works * for timeouts which use the association as their parameter. */ static void sctp_generate_timeout_event(struct sctp_association *asoc, enum sctp_event_timeout timeout_type) { struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); int error = 0; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy: timer %d\n", __func__, timeout_type); /* Try again later. */ if (!mod_timer(&asoc->timers[timeout_type], jiffies + (HZ/20))) sctp_association_hold(asoc); goto out_unlock; } /* Is this association really dead and just waiting around for * the timer to let go of the reference? */ if (asoc->base.dead) goto out_unlock; /* Run through the state machine. 
*/ error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(timeout_type), asoc->state, asoc->ep, asoc, (void *)timeout_type, GFP_ATOMIC); if (error) sk->sk_err = -error; out_unlock: bh_unlock_sock(sk); sctp_association_put(asoc); } static void sctp_generate_t1_cookie_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T1_COOKIE]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_COOKIE); } static void sctp_generate_t1_init_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T1_INIT]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_INIT); } static void sctp_generate_t2_shutdown_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T2_SHUTDOWN); } static void sctp_generate_t4_rto_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T4_RTO]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T4_RTO); } static void sctp_generate_t5_shutdown_guard_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD); } /* sctp_generate_t5_shutdown_guard_event() */ static void sctp_generate_autoclose_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_AUTOCLOSE); } /* Generate a heart beat event. If the sock is busy, reschedule. Make * sure that the transport is still valid. */ void sctp_generate_heartbeat_event(struct timer_list *t) { struct sctp_transport *transport = from_timer(transport, t, hb_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); u32 elapsed, timeout; int error = 0; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ if (!mod_timer(&transport->hb_timer, jiffies + (HZ/20))) sctp_transport_hold(transport); goto out_unlock; } /* Check if we should still send the heartbeat or reschedule */ elapsed = jiffies - transport->last_time_sent; timeout = sctp_transport_timeout(transport); if (elapsed < timeout) { elapsed = timeout - elapsed; if (!mod_timer(&transport->hb_timer, jiffies + elapsed)) sctp_transport_hold(transport); goto out_unlock; } error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_HEARTBEAT), asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); if (error) sk->sk_err = -error; out_unlock: bh_unlock_sock(sk); sctp_transport_put(transport); } /* Handle the timeout of the ICMP protocol unreachable timer. Trigger * the correct state machine transition that will close the association. */ void sctp_generate_proto_unreach_event(struct timer_list *t) { struct sctp_transport *transport = from_timer(transport, t, proto_unreach_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ if (!mod_timer(&transport->proto_unreach_timer, jiffies + (HZ/20))) sctp_transport_hold(transport); goto out_unlock; } /* Is this structure just waiting around for us to actually * get destroyed? 
*/ if (asoc->base.dead) goto out_unlock; sctp_do_sm(net, SCTP_EVENT_T_OTHER, SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH), asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); out_unlock: bh_unlock_sock(sk); sctp_transport_put(transport); } /* Handle the timeout of the RE-CONFIG timer. */ void sctp_generate_reconf_event(struct timer_list *t) { struct sctp_transport *transport = from_timer(transport, t, reconf_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); int error = 0; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ if (!mod_timer(&transport->reconf_timer, jiffies + (HZ / 20))) sctp_transport_hold(transport); goto out_unlock; } /* This happens when the response arrives after the timer is triggered. */ if (!asoc->strreset_chunk) goto out_unlock; error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_RECONF), asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); if (error) sk->sk_err = -error; out_unlock: bh_unlock_sock(sk); sctp_transport_put(transport); } /* Handle the timeout of the probe timer. */ void sctp_generate_probe_event(struct timer_list *t) { struct sctp_transport *transport = from_timer(transport, t, probe_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); int error = 0; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ if (!mod_timer(&transport->probe_timer, jiffies + (HZ / 20))) sctp_transport_hold(transport); goto out_unlock; } error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_PROBE), asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); if (error) sk->sk_err = -error; out_unlock: bh_unlock_sock(sk); sctp_transport_put(transport); } /* Inject a SACK Timeout event into the state machine. */ static void sctp_generate_sack_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_SACK]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_SACK); } sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = { [SCTP_EVENT_TIMEOUT_NONE] = NULL, [SCTP_EVENT_TIMEOUT_T1_COOKIE] = sctp_generate_t1_cookie_event, [SCTP_EVENT_TIMEOUT_T1_INIT] = sctp_generate_t1_init_event, [SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = sctp_generate_t2_shutdown_event, [SCTP_EVENT_TIMEOUT_T3_RTX] = NULL, [SCTP_EVENT_TIMEOUT_T4_RTO] = sctp_generate_t4_rto_event, [SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] = sctp_generate_t5_shutdown_guard_event, [SCTP_EVENT_TIMEOUT_HEARTBEAT] = NULL, [SCTP_EVENT_TIMEOUT_RECONF] = NULL, [SCTP_EVENT_TIMEOUT_SACK] = sctp_generate_sack_event, [SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sctp_generate_autoclose_event, }; /* RFC 2960 8.2 Path Failure Detection * * When its peer endpoint is multi-homed, an endpoint should keep a * error counter for each of the destination transport addresses of the * peer endpoint. * * Each time the T3-rtx timer expires on any address, or when a * HEARTBEAT sent to an idle address is not acknowledged within a RTO, * the error counter of that destination address will be incremented. * When the value in the error counter exceeds the protocol parameter * 'Path.Max.Retrans' of that destination address, the endpoint should * mark the destination transport address as inactive, and a * notification SHOULD be sent to the upper layer. 
* */ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands, struct sctp_association *asoc, struct sctp_transport *transport, int is_hb) { /* The check for association's overall error counter exceeding the * threshold is done in the state function. */ /* We are here due to a timer expiration. If the timer was * not a HEARTBEAT, then normal error tracking is done. * If the timer was a heartbeat, we only increment error counts * when we already have an outstanding HEARTBEAT that has not * been acknowledged. * Additionally, some tranport states inhibit error increments. */ if (!is_hb) { asoc->overall_error_count++; if (transport->state != SCTP_INACTIVE) transport->error_count++; } else if (transport->hb_sent) { if (transport->state != SCTP_UNCONFIRMED) asoc->overall_error_count++; if (transport->state != SCTP_INACTIVE) transport->error_count++; } /* If the transport error count is greater than the pf_retrans * threshold, and less than pathmaxrtx, and if the current state * is SCTP_ACTIVE, then mark this transport as Partially Failed, * see SCTP Quick Failover Draft, section 5.1 */ if (asoc->base.net->sctp.pf_enable && transport->state == SCTP_ACTIVE && transport->error_count < transport->pathmaxrxt && transport->error_count > transport->pf_retrans) { sctp_assoc_control_transport(asoc, transport, SCTP_TRANSPORT_PF, 0); /* Update the hb timer to resend a heartbeat every rto */ sctp_transport_reset_hb_timer(transport); } if (transport->state != SCTP_INACTIVE && (transport->error_count > transport->pathmaxrxt)) { pr_debug("%s: association:%p transport addr:%pISpc failed\n", __func__, asoc, &transport->ipaddr.sa); sctp_assoc_control_transport(asoc, transport, SCTP_TRANSPORT_DOWN, SCTP_FAILED_THRESHOLD); } if (transport->error_count > transport->ps_retrans && asoc->peer.primary_path == transport && asoc->peer.active_path != transport) sctp_assoc_set_primary(asoc, asoc->peer.active_path); /* E2) For the destination address for which the timer * expires, set RTO <- RTO * 2 ("back off the timer"). The * maximum value discussed in rule C7 above (RTO.max) may be * used to provide an upper bound to this doubling operation. * * Special Case: the first HB doesn't trigger exponential backoff. * The first unacknowledged HB triggers it. We do this with a flag * that indicates that we have an outstanding HB. */ if (!is_hb || transport->hb_sent) { transport->rto = min((transport->rto * 2), transport->asoc->rto_max); sctp_max_rto(asoc, transport); } } /* Worker routine to handle INIT command failure. */ static void sctp_cmd_init_failed(struct sctp_cmd_seq *commands, struct sctp_association *asoc, unsigned int error) { struct sctp_ulpevent *event; event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_CANT_STR_ASSOC, (__u16)error, 0, 0, NULL, GFP_ATOMIC); if (event) sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(event)); sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, SCTP_STATE(SCTP_STATE_CLOSED)); /* SEND_FAILED sent later when cleaning up the association. */ asoc->outqueue.error = error; sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); } /* Worker routine to handle SCTP_CMD_ASSOC_FAILED. */ static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands, struct sctp_association *asoc, enum sctp_event_type event_type, union sctp_subtype subtype, struct sctp_chunk *chunk, unsigned int error) { struct sctp_ulpevent *event; struct sctp_chunk *abort; /* Cancel any partial delivery in progress. 
*/ asoc->stream.si->abort_pd(&asoc->ulpq, GFP_ATOMIC); if (event_type == SCTP_EVENT_T_CHUNK && subtype.chunk == SCTP_CID_ABORT) event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST, (__u16)error, 0, 0, chunk, GFP_ATOMIC); else event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST, (__u16)error, 0, 0, NULL, GFP_ATOMIC); if (event) sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(event)); if (asoc->overall_error_count >= asoc->max_retrans) { abort = sctp_make_violation_max_retrans(asoc, chunk); if (abort) sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); } sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, SCTP_STATE(SCTP_STATE_CLOSED)); /* SEND_FAILED sent later when cleaning up the association. */ asoc->outqueue.error = error; sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); } /* Process an init chunk (may be real INIT/INIT-ACK or an embedded INIT * inside the cookie. In reality, this is only used for INIT-ACK processing * since all other cases use "temporary" associations and can do all * their work in statefuns directly. */ static int sctp_cmd_process_init(struct sctp_cmd_seq *commands, struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_init_chunk *peer_init, gfp_t gfp) { int error; /* We only process the init as a sideeffect in a single * case. This is when we process the INIT-ACK. If we * fail during INIT processing (due to malloc problems), * just return the error and stop processing the stack. */ if (!sctp_process_init(asoc, chunk, sctp_source(chunk), peer_init, gfp)) error = -ENOMEM; else error = 0; return error; } /* Helper function to break out starting up of heartbeat timers. */ static void sctp_cmd_hb_timers_start(struct sctp_cmd_seq *cmds, struct sctp_association *asoc) { struct sctp_transport *t; /* Start a heartbeat timer for each transport on the association. * hold a reference on the transport to make sure none of * the needed data structures go away. */ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) sctp_transport_reset_hb_timer(t); } static void sctp_cmd_hb_timers_stop(struct sctp_cmd_seq *cmds, struct sctp_association *asoc) { struct sctp_transport *t; /* Stop all heartbeat timers. */ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { if (del_timer(&t->hb_timer)) sctp_transport_put(t); } } /* Helper function to stop any pending T3-RTX timers */ static void sctp_cmd_t3_rtx_timers_stop(struct sctp_cmd_seq *cmds, struct sctp_association *asoc) { struct sctp_transport *t; list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { if (del_timer(&t->T3_rtx_timer)) sctp_transport_put(t); } } /* Helper function to handle the reception of an HEARTBEAT ACK. */ static void sctp_cmd_transport_on(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_transport *t, struct sctp_chunk *chunk) { struct sctp_sender_hb_info *hbinfo; int was_unconfirmed = 0; /* 8.3 Upon the receipt of the HEARTBEAT ACK, the sender of the * HEARTBEAT should clear the error counter of the destination * transport address to which the HEARTBEAT was sent. */ t->error_count = 0; /* * Although RFC4960 specifies that the overall error count must * be cleared when a HEARTBEAT ACK is received, we make an * exception while in SHUTDOWN PENDING. If the peer keeps its * window shut forever, we may never be able to transmit our * outstanding data and rely on the retransmission limit be reached * to shutdown the association. 
*/ if (t->asoc->state < SCTP_STATE_SHUTDOWN_PENDING) t->asoc->overall_error_count = 0; /* Clear the hb_sent flag to signal that we had a good * acknowledgement. */ t->hb_sent = 0; /* Mark the destination transport address as active if it is not so * marked. */ if ((t->state == SCTP_INACTIVE) || (t->state == SCTP_UNCONFIRMED)) { was_unconfirmed = 1; sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP, SCTP_HEARTBEAT_SUCCESS); } if (t->state == SCTP_PF) sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP, SCTP_HEARTBEAT_SUCCESS); /* HB-ACK was received for a the proper HB. Consider this * forward progress. */ if (t->dst) sctp_transport_dst_confirm(t); /* The receiver of the HEARTBEAT ACK should also perform an * RTT measurement for that destination transport address * using the time value carried in the HEARTBEAT ACK chunk. * If the transport's rto_pending variable has been cleared, * it was most likely due to a retransmit. However, we want * to re-enable it to properly update the rto. */ if (t->rto_pending == 0) t->rto_pending = 1; hbinfo = (struct sctp_sender_hb_info *)chunk->skb->data; sctp_transport_update_rto(t, (jiffies - hbinfo->sent_at)); /* Update the heartbeat timer. */ sctp_transport_reset_hb_timer(t); if (was_unconfirmed && asoc->peer.transport_count == 1) sctp_transport_immediate_rtx(t); } /* Helper function to process the process SACK command. */ static int sctp_cmd_process_sack(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_chunk *chunk) { int err = 0; if (sctp_outq_sack(&asoc->outqueue, chunk)) { /* There are no more TSNs awaiting SACK. */ err = sctp_do_sm(asoc->base.net, SCTP_EVENT_T_OTHER, SCTP_ST_OTHER(SCTP_EVENT_NO_PENDING_TSN), asoc->state, asoc->ep, asoc, NULL, GFP_ATOMIC); } return err; } /* Helper function to set the timeout value for T2-SHUTDOWN timer and to set * the transport for a shutdown chunk. */ static void sctp_cmd_setup_t2(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_chunk *chunk) { struct sctp_transport *t; if (chunk->transport) t = chunk->transport; else { t = sctp_assoc_choose_alter_transport(asoc, asoc->shutdown_last_sent_to); chunk->transport = t; } asoc->shutdown_last_sent_to = t; asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = t->rto; } /* Helper function to change the state of an association. */ static void sctp_cmd_new_state(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, enum sctp_state state) { struct sock *sk = asoc->base.sk; asoc->state = state; pr_debug("%s: asoc:%p[%s]\n", __func__, asoc, sctp_state_tbl[state]); if (sctp_style(sk, TCP)) { /* Change the sk->sk_state of a TCP-style socket that has * successfully completed a connect() call. */ if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED)) inet_sk_set_state(sk, SCTP_SS_ESTABLISHED); /* Set the RCV_SHUTDOWN flag when a SHUTDOWN is received. */ if (sctp_state(asoc, SHUTDOWN_RECEIVED) && sctp_sstate(sk, ESTABLISHED)) { inet_sk_set_state(sk, SCTP_SS_CLOSING); sk->sk_shutdown |= RCV_SHUTDOWN; } } if (sctp_state(asoc, COOKIE_WAIT)) { /* Reset init timeouts since they may have been * increased due to timer expirations. 
*/ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial; asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial; } if (sctp_state(asoc, ESTABLISHED)) { kfree(asoc->peer.cookie); asoc->peer.cookie = NULL; } if (sctp_state(asoc, ESTABLISHED) || sctp_state(asoc, CLOSED) || sctp_state(asoc, SHUTDOWN_RECEIVED)) { /* Wake up any processes waiting in the asoc's wait queue in * sctp_wait_for_connect() or sctp_wait_for_sndbuf(). */ if (waitqueue_active(&asoc->wait)) wake_up_interruptible(&asoc->wait); /* Wake up any processes waiting in the sk's sleep queue of * a TCP-style or UDP-style peeled-off socket in * sctp_wait_for_accept() or sctp_wait_for_packet(). * For a UDP-style socket, the waiters are woken up by the * notifications. */ if (!sctp_style(sk, UDP)) sk->sk_state_change(sk); } if (sctp_state(asoc, SHUTDOWN_PENDING) && !sctp_outq_is_empty(&asoc->outqueue)) sctp_outq_uncork(&asoc->outqueue, GFP_ATOMIC); } /* Helper function to delete an association. */ static void sctp_cmd_delete_tcb(struct sctp_cmd_seq *cmds, struct sctp_association *asoc) { struct sock *sk = asoc->base.sk; /* If it is a non-temporary association belonging to a TCP-style * listening socket that is not closed, do not free it so that accept() * can pick it up later. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING) && (!asoc->temp) && (sk->sk_shutdown != SHUTDOWN_MASK)) return; sctp_association_free(asoc); } /* * ADDIP Section 4.1 ASCONF Chunk Procedures * A4) Start a T-4 RTO timer, using the RTO value of the selected * destination address (we use active path instead of primary path just * because primary path may be inactive. */ static void sctp_cmd_setup_t4(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_chunk *chunk) { struct sctp_transport *t; t = sctp_assoc_choose_alter_transport(asoc, chunk->transport); asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = t->rto; chunk->transport = t; } /* Process an incoming Operation Error Chunk. */ static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_chunk *chunk) { struct sctp_errhdr *err_hdr; struct sctp_ulpevent *ev; while (chunk->chunk_end > chunk->skb->data) { err_hdr = (struct sctp_errhdr *)(chunk->skb->data); ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0, GFP_ATOMIC); if (!ev) return; asoc->stream.si->enqueue_event(&asoc->ulpq, ev); switch (err_hdr->cause) { case SCTP_ERROR_UNKNOWN_CHUNK: { struct sctp_chunkhdr *unk_chunk_hdr; unk_chunk_hdr = (struct sctp_chunkhdr *)(err_hdr + 1); switch (unk_chunk_hdr->type) { /* ADDIP 4.1 A9) If the peer responds to an ASCONF with * an ERROR chunk reporting that it did not recognized * the ASCONF chunk type, the sender of the ASCONF MUST * NOT send any further ASCONF chunks and MUST stop its * T-4 timer. */ case SCTP_CID_ASCONF: if (asoc->peer.asconf_capable == 0) break; asoc->peer.asconf_capable = 0; sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); break; default: break; } break; } default: break; } } } /* Helper function to remove the association non-primary peer * transports. */ static void sctp_cmd_del_non_primary(struct sctp_association *asoc) { struct sctp_transport *t; struct list_head *temp; struct list_head *pos; list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { t = list_entry(pos, struct sctp_transport, transports); if (!sctp_cmp_addr_exact(&t->ipaddr, &asoc->peer.primary_addr)) { sctp_assoc_rm_peer(asoc, t); } } } /* Helper function to set sk_err on a 1-1 style socket. 
*/ static void sctp_cmd_set_sk_err(struct sctp_association *asoc, int error) { struct sock *sk = asoc->base.sk; if (!sctp_style(sk, UDP)) sk->sk_err = error; } /* Helper function to generate an association change event */ static void sctp_cmd_assoc_change(struct sctp_cmd_seq *commands, struct sctp_association *asoc, u8 state) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_assoc_change(asoc, 0, state, 0, asoc->c.sinit_num_ostreams, asoc->c.sinit_max_instreams, NULL, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } static void sctp_cmd_peer_no_auth(struct sctp_cmd_seq *commands, struct sctp_association *asoc) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_authkey(asoc, 0, SCTP_AUTH_NO_AUTH, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } /* Helper function to generate an adaptation indication event */ static void sctp_cmd_adaptation_ind(struct sctp_cmd_seq *commands, struct sctp_association *asoc) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } static void sctp_cmd_t1_timer_update(struct sctp_association *asoc, enum sctp_event_timeout timer, char *name) { struct sctp_transport *t; t = asoc->init_last_sent_to; asoc->init_err_counter++; if (t->init_sent_count > (asoc->init_cycle + 1)) { asoc->timeouts[timer] *= 2; if (asoc->timeouts[timer] > asoc->max_init_timeo) { asoc->timeouts[timer] = asoc->max_init_timeo; } asoc->init_cycle++; pr_debug("%s: T1[%s] timeout adjustment init_err_counter:%d" " cycle:%d timeout:%ld\n", __func__, name, asoc->init_err_counter, asoc->init_cycle, asoc->timeouts[timer]); } } /* Send the whole message, chunk by chunk, to the outqueue. * This way the whole message is queued up and bundling if * encouraged for small fragments. */ static void sctp_cmd_send_msg(struct sctp_association *asoc, struct sctp_datamsg *msg, gfp_t gfp) { struct sctp_chunk *chunk; list_for_each_entry(chunk, &msg->chunks, frag_list) sctp_outq_tail(&asoc->outqueue, chunk, gfp); asoc->outqueue.sched->enqueue(&asoc->outqueue, msg); } /* These three macros allow us to pull the debugging code out of the * main flow of sctp_do_sm() to keep attention focused on the real * functionality there. */ #define debug_pre_sfn() \ pr_debug("%s[pre-fn]: ep:%p, %s, %s, asoc:%p[%s], %s\n", __func__, \ ep, sctp_evttype_tbl[event_type], (*debug_fn)(subtype), \ asoc, sctp_state_tbl[state], state_fn->name) #define debug_post_sfn() \ pr_debug("%s[post-fn]: asoc:%p, status:%s\n", __func__, asoc, \ sctp_status_tbl[status]) #define debug_post_sfx() \ pr_debug("%s[post-sfx]: error:%d, asoc:%p[%s]\n", __func__, error, \ asoc, sctp_state_tbl[(asoc && sctp_id2assoc(ep->base.sk, \ sctp_assoc2id(asoc))) ? asoc->state : SCTP_STATE_CLOSED]) /* * This is the master state machine processing function. * * If you want to understand all of lksctp, this is a * good place to start. */ int sctp_do_sm(struct net *net, enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, gfp_t gfp) { typedef const char *(printfn_t)(union sctp_subtype); static printfn_t *table[] = { NULL, sctp_cname, sctp_tname, sctp_oname, sctp_pname, }; printfn_t *debug_fn __attribute__ ((unused)) = table[event_type]; const struct sctp_sm_table_entry *state_fn; struct sctp_cmd_seq commands; enum sctp_disposition status; int error = 0; /* Look up the state function, run it, and then process the * side effects. 
These three steps are the heart of lksctp. */ state_fn = sctp_sm_lookup_event(net, event_type, state, subtype); sctp_init_cmd_seq(&commands); debug_pre_sfn(); status = state_fn->fn(net, ep, asoc, subtype, event_arg, &commands); debug_post_sfn(); error = sctp_side_effects(event_type, subtype, state, ep, &asoc, event_arg, status, &commands, gfp); debug_post_sfx(); return error; } /***************************************************************** * This the master state function side effect processing function. *****************************************************************/ static int sctp_side_effects(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association **asoc, void *event_arg, enum sctp_disposition status, struct sctp_cmd_seq *commands, gfp_t gfp) { int error; /* FIXME - Most of the dispositions left today would be categorized * as "exceptional" dispositions. For those dispositions, it * may not be proper to run through any of the commands at all. * For example, the command interpreter might be run only with * disposition SCTP_DISPOSITION_CONSUME. */ if (0 != (error = sctp_cmd_interpreter(event_type, subtype, state, ep, *asoc, event_arg, status, commands, gfp))) goto bail; switch (status) { case SCTP_DISPOSITION_DISCARD: pr_debug("%s: ignored sctp protocol event - state:%d, " "event_type:%d, event_id:%d\n", __func__, state, event_type, subtype.chunk); break; case SCTP_DISPOSITION_NOMEM: /* We ran out of memory, so we need to discard this * packet. */ /* BUG--we should now recover some memory, probably by * reneging... */ error = -ENOMEM; break; case SCTP_DISPOSITION_DELETE_TCB: case SCTP_DISPOSITION_ABORT: /* This should now be a command. */ *asoc = NULL; break; case SCTP_DISPOSITION_CONSUME: /* * We should no longer have much work to do here as the * real work has been done as explicit commands above. */ break; case SCTP_DISPOSITION_VIOLATION: net_err_ratelimited("protocol violation state %d chunkid %d\n", state, subtype.chunk); break; case SCTP_DISPOSITION_NOT_IMPL: pr_warn("unimplemented feature in state %d, event_type %d, event_id %d\n", state, event_type, subtype.chunk); break; case SCTP_DISPOSITION_BUG: pr_err("bug in state %d, event_type %d, event_id %d\n", state, event_type, subtype.chunk); BUG(); break; default: pr_err("impossible disposition %d in state %d, event_type %d, event_id %d\n", status, state, event_type, subtype.chunk); error = status; if (error >= 0) error = -EINVAL; WARN_ON_ONCE(1); break; } bail: return error; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* This is the side-effect interpreter. */ static int sctp_cmd_interpreter(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, enum sctp_disposition status, struct sctp_cmd_seq *commands, gfp_t gfp) { struct sctp_sock *sp = sctp_sk(ep->base.sk); struct sctp_chunk *chunk = NULL, *new_obj; struct sctp_packet *packet; struct sctp_sackhdr sackh; struct timer_list *timer; struct sctp_transport *t; unsigned long timeout; struct sctp_cmd *cmd; int local_cork = 0; int error = 0; int force; if (SCTP_EVENT_T_TIMEOUT != event_type) chunk = event_arg; /* Note: This whole file is a huge candidate for rework. 
* For example, each command could either have its own handler, so * the loop would look like: * while (cmds) * cmd->handle(x, y, z) * --jgrimm */ while (NULL != (cmd = sctp_next_cmd(commands))) { switch (cmd->verb) { case SCTP_CMD_NOP: /* Do nothing. */ break; case SCTP_CMD_NEW_ASOC: /* Register a new association. */ if (local_cork) { sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } /* Register with the endpoint. */ asoc = cmd->obj.asoc; BUG_ON(asoc->peer.primary_path == NULL); sctp_endpoint_add_asoc(ep, asoc); break; case SCTP_CMD_PURGE_OUTQUEUE: sctp_outq_teardown(&asoc->outqueue); break; case SCTP_CMD_DELETE_TCB: if (local_cork) { sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } /* Delete the current association. */ sctp_cmd_delete_tcb(commands, asoc); asoc = NULL; break; case SCTP_CMD_NEW_STATE: /* Enter a new state. */ sctp_cmd_new_state(commands, asoc, cmd->obj.state); break; case SCTP_CMD_REPORT_TSN: /* Record the arrival of a TSN. */ error = sctp_tsnmap_mark(&asoc->peer.tsn_map, cmd->obj.u32, NULL); break; case SCTP_CMD_REPORT_FWDTSN: asoc->stream.si->report_ftsn(&asoc->ulpq, cmd->obj.u32); break; case SCTP_CMD_PROCESS_FWDTSN: asoc->stream.si->handle_ftsn(&asoc->ulpq, cmd->obj.chunk); break; case SCTP_CMD_GEN_SACK: /* Generate a Selective ACK. * The argument tells us whether to just count * the packet and MAYBE generate a SACK, or * force a SACK out. */ force = cmd->obj.i32; error = sctp_gen_sack(asoc, force, commands); break; case SCTP_CMD_PROCESS_SACK: /* Process an inbound SACK. */ error = sctp_cmd_process_sack(commands, asoc, cmd->obj.chunk); break; case SCTP_CMD_GEN_INIT_ACK: /* Generate an INIT ACK chunk. */ new_obj = sctp_make_init_ack(asoc, chunk, GFP_ATOMIC, 0); if (!new_obj) { error = -ENOMEM; break; } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); break; case SCTP_CMD_PEER_INIT: /* Process a unified INIT from the peer. * Note: Only used during INIT-ACK processing. If * there is an error just return to the outter * layer which will bail. */ error = sctp_cmd_process_init(commands, asoc, chunk, cmd->obj.init, gfp); break; case SCTP_CMD_GEN_COOKIE_ECHO: /* Generate a COOKIE ECHO chunk. */ new_obj = sctp_make_cookie_echo(asoc, chunk); if (!new_obj) { if (cmd->obj.chunk) sctp_chunk_free(cmd->obj.chunk); error = -ENOMEM; break; } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); /* If there is an ERROR chunk to be sent along with * the COOKIE_ECHO, send it, too. */ if (cmd->obj.chunk) sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(cmd->obj.chunk)); if (new_obj->transport) { new_obj->transport->init_sent_count++; asoc->init_last_sent_to = new_obj->transport; } /* FIXME - Eventually come up with a cleaner way to * enabling COOKIE-ECHO + DATA bundling during * multihoming stale cookie scenarios, the following * command plays with asoc->peer.retran_path to * avoid the problem of sending the COOKIE-ECHO and * DATA in different paths, which could result * in the association being ABORTed if the DATA chunk * is processed first by the server. Checking the * init error counter simply causes this command * to be executed only during failed attempts of * association establishment. */ if ((asoc->peer.retran_path != asoc->peer.primary_path) && (asoc->init_err_counter > 0)) { sctp_add_cmd_sf(commands, SCTP_CMD_FORCE_PRIM_RETRAN, SCTP_NULL()); } break; case SCTP_CMD_GEN_SHUTDOWN: /* Generate SHUTDOWN when in SHUTDOWN_SENT state. * Reset error counts. */ asoc->overall_error_count = 0; /* Generate a SHUTDOWN chunk. 
*/ new_obj = sctp_make_shutdown(asoc, chunk); if (!new_obj) { error = -ENOMEM; break; } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); break; case SCTP_CMD_CHUNK_ULP: /* Send a chunk to the sockets layer. */ pr_debug("%s: sm_sideff: chunk_up:%p, ulpq:%p\n", __func__, cmd->obj.chunk, &asoc->ulpq); asoc->stream.si->ulpevent_data(&asoc->ulpq, cmd->obj.chunk, GFP_ATOMIC); break; case SCTP_CMD_EVENT_ULP: /* Send a notification to the sockets layer. */ pr_debug("%s: sm_sideff: event_up:%p, ulpq:%p\n", __func__, cmd->obj.ulpevent, &asoc->ulpq); asoc->stream.si->enqueue_event(&asoc->ulpq, cmd->obj.ulpevent); break; case SCTP_CMD_REPLY: /* If an caller has not already corked, do cork. */ if (!asoc->outqueue.cork) { sctp_outq_cork(&asoc->outqueue); local_cork = 1; } /* Send a chunk to our peer. */ sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, gfp); break; case SCTP_CMD_SEND_PKT: /* Send a full packet to our peer. */ packet = cmd->obj.packet; sctp_packet_transmit(packet, gfp); sctp_ootb_pkt_free(packet); break; case SCTP_CMD_T1_RETRAN: /* Mark a transport for retransmission. */ sctp_retransmit(&asoc->outqueue, cmd->obj.transport, SCTP_RTXR_T1_RTX); break; case SCTP_CMD_RETRAN: /* Mark a transport for retransmission. */ sctp_retransmit(&asoc->outqueue, cmd->obj.transport, SCTP_RTXR_T3_RTX); break; case SCTP_CMD_ECN_CE: /* Do delayed CE processing. */ sctp_do_ecn_ce_work(asoc, cmd->obj.u32); break; case SCTP_CMD_ECN_ECNE: /* Do delayed ECNE processing. */ new_obj = sctp_do_ecn_ecne_work(asoc, cmd->obj.u32, chunk); if (new_obj) sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); break; case SCTP_CMD_ECN_CWR: /* Do delayed CWR processing. */ sctp_do_ecn_cwr_work(asoc, cmd->obj.u32); break; case SCTP_CMD_SETUP_T2: sctp_cmd_setup_t2(commands, asoc, cmd->obj.chunk); break; case SCTP_CMD_TIMER_START_ONCE: timer = &asoc->timers[cmd->obj.to]; if (timer_pending(timer)) break; fallthrough; case SCTP_CMD_TIMER_START: timer = &asoc->timers[cmd->obj.to]; timeout = asoc->timeouts[cmd->obj.to]; BUG_ON(!timeout); /* * SCTP has a hard time with timer starts. Because we process * timer starts as side effects, it can be hard to tell if we * have already started a timer or not, which leads to BUG * halts when we call add_timer. So here, instead of just starting * a timer, if the timer is already started, and just mod * the timer with the shorter of the two expiration times */ if (!timer_pending(timer)) sctp_association_hold(asoc); timer_reduce(timer, jiffies + timeout); break; case SCTP_CMD_TIMER_RESTART: timer = &asoc->timers[cmd->obj.to]; timeout = asoc->timeouts[cmd->obj.to]; if (!mod_timer(timer, jiffies + timeout)) sctp_association_hold(asoc); break; case SCTP_CMD_TIMER_STOP: timer = &asoc->timers[cmd->obj.to]; if (del_timer(timer)) sctp_association_put(asoc); break; case SCTP_CMD_INIT_CHOOSE_TRANSPORT: chunk = cmd->obj.chunk; t = sctp_assoc_choose_alter_transport(asoc, asoc->init_last_sent_to); asoc->init_last_sent_to = t; chunk->transport = t; t->init_sent_count++; /* Set the new transport as primary */ sctp_assoc_set_primary(asoc, t); break; case SCTP_CMD_INIT_RESTART: /* Do the needed accounting and updates * associated with restarting an initialization * timer. Only multiply the timeout by two if * all transports have been tried at the current * timeout. 
*/ sctp_cmd_t1_timer_update(asoc, SCTP_EVENT_TIMEOUT_T1_INIT, "INIT"); sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); break; case SCTP_CMD_COOKIEECHO_RESTART: /* Do the needed accounting and updates * associated with restarting an initialization * timer. Only multiply the timeout by two if * all transports have been tried at the current * timeout. */ sctp_cmd_t1_timer_update(asoc, SCTP_EVENT_TIMEOUT_T1_COOKIE, "COOKIE"); /* If we've sent any data bundled with * COOKIE-ECHO we need to resend. */ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { sctp_retransmit_mark(&asoc->outqueue, t, SCTP_RTXR_T1_RTX); } sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE)); break; case SCTP_CMD_INIT_FAILED: sctp_cmd_init_failed(commands, asoc, cmd->obj.u16); break; case SCTP_CMD_ASSOC_FAILED: sctp_cmd_assoc_failed(commands, asoc, event_type, subtype, chunk, cmd->obj.u16); break; case SCTP_CMD_INIT_COUNTER_INC: asoc->init_err_counter++; break; case SCTP_CMD_INIT_COUNTER_RESET: asoc->init_err_counter = 0; asoc->init_cycle = 0; list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { t->init_sent_count = 0; } break; case SCTP_CMD_REPORT_DUP: sctp_tsnmap_mark_dup(&asoc->peer.tsn_map, cmd->obj.u32); break; case SCTP_CMD_REPORT_BAD_TAG: pr_debug("%s: vtag mismatch!\n", __func__); break; case SCTP_CMD_STRIKE: /* Mark one strike against a transport. */ sctp_do_8_2_transport_strike(commands, asoc, cmd->obj.transport, 0); break; case SCTP_CMD_TRANSPORT_IDLE: t = cmd->obj.transport; sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE); break; case SCTP_CMD_TRANSPORT_HB_SENT: t = cmd->obj.transport; sctp_do_8_2_transport_strike(commands, asoc, t, 1); t->hb_sent = 1; break; case SCTP_CMD_TRANSPORT_ON: t = cmd->obj.transport; sctp_cmd_transport_on(commands, asoc, t, chunk); break; case SCTP_CMD_HB_TIMERS_START: sctp_cmd_hb_timers_start(commands, asoc); break; case SCTP_CMD_HB_TIMER_UPDATE: t = cmd->obj.transport; sctp_transport_reset_hb_timer(t); break; case SCTP_CMD_HB_TIMERS_STOP: sctp_cmd_hb_timers_stop(commands, asoc); break; case SCTP_CMD_PROBE_TIMER_UPDATE: t = cmd->obj.transport; sctp_transport_reset_probe_timer(t); break; case SCTP_CMD_REPORT_ERROR: error = cmd->obj.error; break; case SCTP_CMD_PROCESS_CTSN: /* Dummy up a SACK for processing. */ sackh.cum_tsn_ack = cmd->obj.be32; sackh.a_rwnd = htonl(asoc->peer.rwnd + asoc->outqueue.outstanding_bytes); sackh.num_gap_ack_blocks = 0; sackh.num_dup_tsns = 0; chunk->subh.sack_hdr = &sackh; sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, SCTP_CHUNK(chunk)); break; case SCTP_CMD_DISCARD_PACKET: /* We need to discard the whole packet. 
* Uncork the queue since there might be * responses pending */ chunk->pdiscard = 1; if (asoc) { sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } break; case SCTP_CMD_RTO_PENDING: t = cmd->obj.transport; t->rto_pending = 1; break; case SCTP_CMD_PART_DELIVER: asoc->stream.si->start_pd(&asoc->ulpq, GFP_ATOMIC); break; case SCTP_CMD_RENEGE: asoc->stream.si->renege_events(&asoc->ulpq, cmd->obj.chunk, GFP_ATOMIC); break; case SCTP_CMD_SETUP_T4: sctp_cmd_setup_t4(commands, asoc, cmd->obj.chunk); break; case SCTP_CMD_PROCESS_OPERR: sctp_cmd_process_operr(commands, asoc, chunk); break; case SCTP_CMD_CLEAR_INIT_TAG: asoc->peer.i.init_tag = 0; break; case SCTP_CMD_DEL_NON_PRIMARY: sctp_cmd_del_non_primary(asoc); break; case SCTP_CMD_T3_RTX_TIMERS_STOP: sctp_cmd_t3_rtx_timers_stop(commands, asoc); break; case SCTP_CMD_FORCE_PRIM_RETRAN: t = asoc->peer.retran_path; asoc->peer.retran_path = asoc->peer.primary_path; sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; asoc->peer.retran_path = t; break; case SCTP_CMD_SET_SK_ERR: sctp_cmd_set_sk_err(asoc, cmd->obj.error); break; case SCTP_CMD_ASSOC_CHANGE: sctp_cmd_assoc_change(commands, asoc, cmd->obj.u8); break; case SCTP_CMD_ADAPTATION_IND: sctp_cmd_adaptation_ind(commands, asoc); break; case SCTP_CMD_PEER_NO_AUTH: sctp_cmd_peer_no_auth(commands, asoc); break; case SCTP_CMD_ASSOC_SHKEY: error = sctp_auth_asoc_init_active_key(asoc, GFP_ATOMIC); break; case SCTP_CMD_UPDATE_INITTAG: asoc->peer.i.init_tag = cmd->obj.u32; break; case SCTP_CMD_SEND_MSG: if (!asoc->outqueue.cork) { sctp_outq_cork(&asoc->outqueue); local_cork = 1; } sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp); break; case SCTP_CMD_PURGE_ASCONF_QUEUE: sctp_asconf_queue_teardown(asoc); break; case SCTP_CMD_SET_ASOC: if (asoc && local_cork) { sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } asoc = cmd->obj.asoc; break; default: pr_warn("Impossible command: %u\n", cmd->verb); break; } if (error) { cmd = sctp_next_cmd(commands); while (cmd) { if (cmd->verb == SCTP_CMD_REPLY) sctp_chunk_free(cmd->obj.chunk); cmd = sctp_next_cmd(commands); } break; } } /* If this is in response to a received chunk, wait until * we are done with the packet to open the queue so that we don't * send multiple packets in response to a single request. */ if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) { if (chunk->end_of_packet || chunk->singleton) sctp_outq_uncork(&asoc->outqueue, gfp); } else if (local_cork) sctp_outq_uncork(&asoc->outqueue, gfp); if (sp->data_ready_signalled) sp->data_ready_signalled = 0; return error; }
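The command sequence consumed by sctp_cmd_interpreter() is produced by the state functions that sctp_do_sm() looks up. As a minimal, hypothetical sketch (not taken from the tree; the function name and the choice of reply chunk are invented for illustration, while the verbs, macros and dispositions are the ones used above), a state function queues its side effects as commands and returns a disposition instead of acting directly:

static enum sctp_disposition sctp_sf_example_reply(struct net *net,
					const struct sctp_endpoint *ep,
					const struct sctp_association *asoc,
					const union sctp_subtype type,
					void *arg,
					struct sctp_cmd_seq *commands)
{
	struct sctp_chunk *chunk = arg;	/* chunk that triggered this event */
	struct sctp_chunk *reply;

	/* Build some reply chunk; any sctp_make_*() constructor follows
	 * this pattern (illustrative choice, not tied to a real state).
	 */
	reply = sctp_make_heartbeat_ack(asoc, chunk, chunk->skb->data,
					chunk->skb->len);
	if (!reply)
		return SCTP_DISPOSITION_NOMEM;

	/* Queue the reply; SCTP_CMD_REPLY is executed later by
	 * sctp_cmd_interpreter(), which handles corking the outqueue.
	 */
	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));

	/* Timer manipulation is also expressed as a command. */
	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
			SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));

	return SCTP_DISPOSITION_CONSUME;
}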
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2004 */ /* * jfs_umount.c * * note: file system in transition to aggregate/fileset: * (ref. jfs_mount.c) * * file system unmount is interpreted as mount of the single/only * fileset in the aggregate and, if unmount of the last fileset, * as unmount of the aggregate; */ #include <linux/fs.h> #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_superblock.h" #include "jfs_dmap.h" #include "jfs_imap.h" #include "jfs_metapage.h" #include "jfs_debug.h" /* * NAME: jfs_umount(vfsp, flags, crp) * * FUNCTION: vfs_umount() * * PARAMETERS: vfsp - virtual file system pointer * flags - unmount for shutdown * crp - credential * * RETURN : EBUSY - device has open files */ int jfs_umount(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct inode *ipbmap = sbi->ipbmap; struct inode *ipimap = sbi->ipimap; struct inode *ipaimap = sbi->ipaimap; struct inode *ipaimap2 = sbi->ipaimap2; struct jfs_log *log; int rc = 0; jfs_info("UnMount JFS: sb:0x%p", sb); /* * update superblock and close log * * if mounted read-write and log based recovery was enabled */ if ((log = sbi->log)) /* * Wait for outstanding transactions to be written to log: */ jfs_flush_journal(log, 2); /* * close fileset inode allocation map (aka fileset inode) */ diUnmount(ipimap, 0); diFreeSpecial(ipimap); sbi->ipimap = NULL; /* * close secondary aggregate inode allocation map */ if (ipaimap2) { diUnmount(ipaimap2, 0); diFreeSpecial(ipaimap2); sbi->ipaimap2 = NULL; } /* * close aggregate inode allocation map */ diUnmount(ipaimap, 0); diFreeSpecial(ipaimap); sbi->ipaimap = NULL; /* * close aggregate block allocation map */ dbUnmount(ipbmap, 0); diFreeSpecial(ipbmap); sbi->ipbmap = NULL; /* * Make sure all metadata makes it to disk before we mark * the superblock as clean */ filemap_write_and_wait(sbi->direct_inode->i_mapping); /* * ensure all file system file pages are propagated to their * home blocks on disk (and their in-memory buffer pages are * invalidated) BEFORE updating file system superblock state * (to signify file system is unmounted cleanly, and thus in * consistent state) and log superblock active file system * list (to signify skip logredo()). */ if (log) { /* log = NULL if read-only mount */ updateSuper(sb, FM_CLEAN); /* * close log: * * remove file system from log active file system list. */ rc = lmLogClose(sb); } jfs_info("UnMount JFS Complete: rc = %d", rc); return rc; } int jfs_umount_rw(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; if (!log) return 0; /* * close log: * * remove file system from log active file system list. */ jfs_flush_journal(log, 2); /* * Make sure all metadata makes it to disk */ dbSync(sbi->ipbmap); diSync(sbi->ipimap); /* * Note that we have to do this even if sync_blockdev() will * do exactly the same a few instructions later: We can't * mark the superblock clean before everything is flushed to * disk. 
*/ filemap_write_and_wait(sbi->direct_inode->i_mapping); updateSuper(sb, FM_CLEAN); return lmLogClose(sb); }
// SPDX-License-Identifier: GPL-2.0-or-later #include <linux/plist.h> #include <linux/sched/signal.h> #include "futex.h" #include "../locking/rtmutex_common.h" /* * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an * underlying rtmutex. The task which is about to be requeued could have * just woken up (timeout, signal). After the wake up the task has to * acquire hash bucket lock, which is held by the requeue code. As a task * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking * and the hash bucket lock blocking would collide and corrupt state. * * On !PREEMPT_RT this is not a problem and everything could be serialized * on hash bucket lock, but aside of having the benefit of common code, * this allows to avoid doing the requeue when the task is already on the * way out and taking the hash bucket lock of the original uaddr1 when the * requeue has been completed. * * The following state transitions are valid: * * On the waiter side: * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT * * On the requeue side: * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed) * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed) * * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this * signals that the waiter is already on the way out. It also means that * the waiter is still on the 'wait' futex, i.e. uaddr1. * * The waiter side signals early wakeup to the requeue side either through * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT, * which means the wakeup is interleaving with a requeue in progress it has * to wait for the requeue side to change the state. Either to DONE/LOCKED * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by * the requeue side when the requeue attempt failed via deadlock detection * and therefore the waiter q is still on the uaddr1 futex. */ enum { Q_REQUEUE_PI_NONE = 0, Q_REQUEUE_PI_IGNORE, Q_REQUEUE_PI_IN_PROGRESS, Q_REQUEUE_PI_WAIT, Q_REQUEUE_PI_DONE, Q_REQUEUE_PI_LOCKED, }; const struct futex_q futex_q_init = { /* list gets initialized in futex_queue()*/ .wake = futex_wake_mark, .key = FUTEX_KEY_INIT, .bitset = FUTEX_BITSET_MATCH_ANY, .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE), }; /** * requeue_futex() - Requeue a futex_q from one hb to another * @q: the futex_q to requeue * @hb1: the source hash_bucket * @hb2: the target hash_bucket * @key2: the new key for the requeued futex_q */ static inline void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2, union futex_key *key2) { /* * If key1 and key2 hash to the same bucket, no need to * requeue. 
*/ if (likely(&hb1->chain != &hb2->chain)) { plist_del(&q->list, &hb1->chain); futex_hb_waiters_dec(hb1); futex_hb_waiters_inc(hb2); plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; } q->key = *key2; } static inline bool futex_requeue_pi_prepare(struct futex_q *q, struct futex_pi_state *pi_state) { int old, new; /* * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has * already set Q_REQUEUE_PI_IGNORE to signal that requeue should * ignore the waiter. */ old = atomic_read_acquire(&q->requeue_state); do { if (old == Q_REQUEUE_PI_IGNORE) return false; /* * futex_proxy_trylock_atomic() might have set it to * IN_PROGRESS and a interleaved early wake to WAIT. * * It was considered to have an extra state for that * trylock, but that would just add more conditionals * all over the place for a dubious value. */ if (old != Q_REQUEUE_PI_NONE) break; new = Q_REQUEUE_PI_IN_PROGRESS; } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); q->pi_state = pi_state; return true; } static inline void futex_requeue_pi_complete(struct futex_q *q, int locked) { int old, new; old = atomic_read_acquire(&q->requeue_state); do { if (old == Q_REQUEUE_PI_IGNORE) return; if (locked >= 0) { /* Requeue succeeded. Set DONE or LOCKED */ WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS && old != Q_REQUEUE_PI_WAIT); new = Q_REQUEUE_PI_DONE + locked; } else if (old == Q_REQUEUE_PI_IN_PROGRESS) { /* Deadlock, no early wakeup interleave */ new = Q_REQUEUE_PI_NONE; } else { /* Deadlock, early wakeup interleave. */ WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT); new = Q_REQUEUE_PI_IGNORE; } } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); #ifdef CONFIG_PREEMPT_RT /* If the waiter interleaved with the requeue let it know */ if (unlikely(old == Q_REQUEUE_PI_WAIT)) rcuwait_wake_up(&q->requeue_wait); #endif } static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) { int old, new; old = atomic_read_acquire(&q->requeue_state); do { /* Is requeue done already? */ if (old >= Q_REQUEUE_PI_DONE) return old; /* * If not done, then tell the requeue code to either ignore * the waiter or to wake it up once the requeue is done. */ new = Q_REQUEUE_PI_WAIT; if (old == Q_REQUEUE_PI_NONE) new = Q_REQUEUE_PI_IGNORE; } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); /* If the requeue was in progress, wait for it to complete */ if (old == Q_REQUEUE_PI_IN_PROGRESS) { #ifdef CONFIG_PREEMPT_RT rcuwait_wait_event(&q->requeue_wait, atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT, TASK_UNINTERRUPTIBLE); #else (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT); #endif } /* * Requeue is now either prohibited or complete. Reread state * because during the wait above it might have changed. Nothing * will modify q->requeue_state after this point. */ return atomic_read(&q->requeue_state); } /** * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue * @q: the futex_q * @key: the key of the requeue target futex * @hb: the hash_bucket of the requeue target futex * * During futex_requeue, with requeue_pi=1, it is possible to acquire the * target futex if it is uncontended or via a lock steal. * * 1) Set @q::key to the requeue target futex key so the waiter can detect * the wakeup on the right futex. * * 2) Dequeue @q from the hash bucket. * * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock * acquisition. * * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that * the waiter has to fixup the pi state. 
* * 5) Complete the requeue state so the waiter can make progress. After * this point the waiter task can return from the syscall immediately in * case that the pi state does not have to be fixed up. * * 6) Wake the waiter task. * * Must be called with both q->lock_ptr and hb->lock held. */ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { q->key = *key; __futex_unqueue(q); WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; q->lock_ptr = &hb->lock; /* Signal locked state to the waiter */ futex_requeue_pi_complete(q, 1); wake_up_state(q->task, TASK_NORMAL); } /** * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter * @pifutex: the user address of the to futex * @hb1: the from futex hash bucket, must be locked by the caller * @hb2: the to futex hash bucket, must be locked by the caller * @key1: the from futex key * @key2: the to futex key * @ps: address to store the pi_state pointer * @exiting: Pointer to store the task pointer of the owner task * which is in the middle of exiting * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * * Try and get the lock on behalf of the top waiter if we can do it atomically. * Wake the top waiter if we succeed. If the caller specified set_waiters, * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. * hb1 and hb2 must be held by the caller. * * @exiting is only set when the return value is -EBUSY. If so, this holds * a refcount on the exiting task on return and the caller needs to drop it * after waiting for the exit to complete. * * Return: * - 0 - failed to acquire the lock atomically; * - >0 - acquired the lock, return value is vpid of the top_waiter * - <0 - error */ static int futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2, union futex_key *key1, union futex_key *key2, struct futex_pi_state **ps, struct task_struct **exiting, int set_waiters) { struct futex_q *top_waiter; u32 curval; int ret; if (futex_get_value_locked(&curval, pifutex)) return -EFAULT; if (unlikely(should_fail_futex(true))) return -EFAULT; /* * Find the top_waiter and determine if there are additional waiters. * If the caller intends to requeue more than 1 waiter to pifutex, * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, * as we have means to handle the possible fault. If not, don't set * the bit unnecessarily as it will force the subsequent unlock to enter * the kernel. */ top_waiter = futex_top_waiter(hb1, key1); /* There are no waiters, nothing for us to do. */ if (!top_waiter) return 0; /* * Ensure that this is a waiter sitting in futex_wait_requeue_pi() * and waiting on the 'waitqueue' futex which is always !PI. */ if (!top_waiter->rt_waiter || top_waiter->pi_state) return -EINVAL; /* Ensure we requeue to the expected futex. */ if (!futex_match(top_waiter->requeue_pi_key, key2)) return -EINVAL; /* Ensure that this does not race against an early wakeup */ if (!futex_requeue_pi_prepare(top_waiter, NULL)) return -EAGAIN; /* * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit * in the contended case or if @set_waiters is true. * * In the contended case PI state is attached to the lock owner. If * the user space lock can be acquired then PI state is attached to * the new owner (@top_waiter->task) when @set_waiters is true. 
*/ ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, exiting, set_waiters); if (ret == 1) { /* * Lock was acquired in user space and PI state was * attached to @top_waiter->task. That means state is fully * consistent and the waiter can return to user space * immediately after the wakeup. */ requeue_pi_wake_futex(top_waiter, key2, hb2); } else if (ret < 0) { /* Rewind top_waiter::requeue_state */ futex_requeue_pi_complete(top_waiter, ret); } else { /* * futex_lock_pi_atomic() did not acquire the user space * futex, but managed to establish the proxy lock and pi * state. top_waiter::requeue_state cannot be fixed up here * because the waiter is not enqueued on the rtmutex * yet. This is handled at the callsite depending on the * result of rt_mutex_start_proxy_lock() which is * guaranteed to be reached with this function returning 0. */ } return ret; } /** * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 * @uaddr1: source futex user address * @flags1: futex flags (FLAGS_SHARED, etc.) * @uaddr2: target futex user address * @flags2: futex flags (FLAGS_SHARED, etc.) * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) * @nr_requeue: number of waiters to requeue (0-INT_MAX) * @cmpval: @uaddr1 expected value (or %NULL) * @requeue_pi: if we are attempting to requeue from a non-pi futex to a * pi futex (pi to pi requeue is not supported) * * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire * uaddr2 atomically on behalf of the top waiter. * * Return: * - >=0 - on success, the number of tasks requeued or woken; * - <0 - on error */ int futex_requeue(u32 __user *uaddr1, unsigned int flags1, u32 __user *uaddr2, unsigned int flags2, int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; int task_count = 0, ret; struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); if (nr_wake < 0 || nr_requeue < 0) return -EINVAL; /* * When PI not supported: return -ENOSYS if requeue_pi is true, * consequently the compiler knows requeue_pi is always false past * this point which will optimize away all the conditional code * further down. */ if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) return -ENOSYS; if (requeue_pi) { /* * Requeue PI only works on two distinct uaddrs. This * check is only valid for private futexes. See below. */ if (uaddr1 == uaddr2) return -EINVAL; /* * futex_requeue() allows the caller to define the number * of waiters to wake up via the @nr_wake argument. With * REQUEUE_PI, waking up more than one waiter is creating * more problems than it solves. Waking up a waiter makes * only sense if the PI futex @uaddr2 is uncontended as * this allows the requeue code to acquire the futex * @uaddr2 before waking the waiter. The waiter can then * return to user space without further action. A secondary * wakeup would just make the futex_wait_requeue_pi() * handling more complex, because that code would have to * look up pi_state and do more or less all the handling * which the requeue code has to do for the to be requeued * waiters. So restrict the number of waiters to wake to * one, and only wake it up when the PI futex is * uncontended. Otherwise requeue it and let the unlock of * the PI futex handle the wakeup. * * All REQUEUE_PI users, e.g. pthread_cond_signal() and * pthread_cond_broadcast() must use nr_wake=1. 
*/ if (nr_wake != 1) return -EINVAL; /* * requeue_pi requires a pi_state, try to allocate it now * without any locks in case it fails. */ if (refill_pi_state_cache()) return -ENOMEM; } retry: ret = get_futex_key(uaddr1, flags1, &key1, FUTEX_READ); if (unlikely(ret != 0)) return ret; ret = get_futex_key(uaddr2, flags2, &key2, requeue_pi ? FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) return ret; /* * The check above which compares uaddrs is not sufficient for * shared futexes. We need to compare the keys: */ if (requeue_pi && futex_match(&key1, &key2)) return -EINVAL; hb1 = futex_hash(&key1); hb2 = futex_hash(&key2); retry_private: futex_hb_waiters_inc(hb2); double_lock_hb(hb1, hb2); if (likely(cmpval != NULL)) { u32 curval; ret = futex_get_value_locked(&curval, uaddr1); if (unlikely(ret)) { double_unlock_hb(hb1, hb2); futex_hb_waiters_dec(hb2); ret = get_user(curval, uaddr1); if (ret) return ret; if (!(flags1 & FLAGS_SHARED)) goto retry_private; goto retry; } if (curval != *cmpval) { ret = -EAGAIN; goto out_unlock; } } if (requeue_pi) { struct task_struct *exiting = NULL; /* * Attempt to acquire uaddr2 and wake the top waiter. If we * intend to requeue waiters, force setting the FUTEX_WAITERS * bit. We force this here where we are able to easily handle * faults rather in the requeue loop below. * * Updates topwaiter::requeue_state if a top waiter exists. */ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, &key2, &pi_state, &exiting, nr_requeue); /* * At this point the top_waiter has either taken uaddr2 or * is waiting on it. In both cases pi_state has been * established and an initial refcount on it. In case of an * error there's nothing. * * The top waiter's requeue_state is up to date: * * - If the lock was acquired atomically (ret == 1), then * the state is Q_REQUEUE_PI_LOCKED. * * The top waiter has been dequeued and woken up and can * return to user space immediately. The kernel/user * space state is consistent. In case that there must be * more waiters requeued the WAITERS bit in the user * space futex is set so the top waiter task has to go * into the syscall slowpath to unlock the futex. This * will block until this requeue operation has been * completed and the hash bucket locks have been * dropped. * * - If the trylock failed with an error (ret < 0) then * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing * happened", or Q_REQUEUE_PI_IGNORE when there was an * interleaved early wakeup. * * - If the trylock did not succeed (ret == 0) then the * state is either Q_REQUEUE_PI_IN_PROGRESS or * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. * This will be cleaned up in the loop below, which * cannot fail because futex_proxy_trylock_atomic() did * the same sanity checks for requeue_pi as the loop * below does. */ switch (ret) { case 0: /* We hold a reference on the pi state. */ break; case 1: /* * futex_proxy_trylock_atomic() acquired the user space * futex. Adjust task_count. */ task_count++; ret = 0; break; /* * If the above failed, then pi_state is NULL and * waiter::requeue_state is correct. */ case -EFAULT: double_unlock_hb(hb1, hb2); futex_hb_waiters_dec(hb2); ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; return ret; case -EBUSY: case -EAGAIN: /* * Two reasons for this: * - EBUSY: Owner is exiting and we just wait for the * exit to complete. * - EAGAIN: The user space value changed. */ double_unlock_hb(hb1, hb2); futex_hb_waiters_dec(hb2); /* * Handle the case where the owner is in the middle of * exiting. 
Wait for the exit to complete otherwise * this task might loop forever, aka. live lock. */ wait_for_owner_exiting(ret, exiting); cond_resched(); goto retry; default: goto out_unlock; } } plist_for_each_entry_safe(this, next, &hb1->chain, list) { if (task_count - nr_wake >= nr_requeue) break; if (!futex_match(&this->key, &key1)) continue; /* * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always * be paired with each other and no other futex ops. * * We should never be requeueing a futex_q with a pi_state, * which is awaiting a futex_unlock_pi(). */ if ((requeue_pi && !this->rt_waiter) || (!requeue_pi && this->rt_waiter) || this->pi_state) { ret = -EINVAL; break; } /* Plain futexes just wake or requeue and are done */ if (!requeue_pi) { if (++task_count <= nr_wake) this->wake(&wake_q, this); else requeue_futex(this, hb1, hb2, &key2); continue; } /* Ensure we requeue to the expected futex for requeue_pi. */ if (!futex_match(this->requeue_pi_key, &key2)) { ret = -EINVAL; break; } /* * Requeue nr_requeue waiters and possibly one more in the case * of requeue_pi if we couldn't acquire the lock atomically. * * Prepare the waiter to take the rt_mutex. Take a refcount * on the pi_state and store the pointer in the futex_q * object of the waiter. */ get_pi_state(pi_state); /* Don't requeue when the waiter is already on the way out. */ if (!futex_requeue_pi_prepare(this, pi_state)) { /* * Early woken waiter signaled that it is on the * way out. Drop the pi_state reference and try the * next waiter. @this->pi_state is still NULL. */ put_pi_state(pi_state); continue; } ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, this->rt_waiter, this->task); if (ret == 1) { /* * We got the lock. We do neither drop the refcount * on pi_state nor clear this->pi_state because the * waiter needs the pi_state for cleaning up the * user space value. It will drop the refcount * after doing so. this::requeue_state is updated * in the wakeup as well. */ requeue_pi_wake_futex(this, &key2, hb2); task_count++; } else if (!ret) { /* Waiter is queued, move it to hb2 */ requeue_futex(this, hb1, hb2, &key2); futex_requeue_pi_complete(this, 0); task_count++; } else { /* * rt_mutex_start_proxy_lock() detected a potential * deadlock when we tried to queue that waiter. * Drop the pi_state reference which we took above * and remove the pointer to the state from the * waiters futex_q object. */ this->pi_state = NULL; put_pi_state(pi_state); futex_requeue_pi_complete(this, ret); /* * We stop queueing more waiters and let user space * deal with the mess. */ break; } } /* * We took an extra initial reference to the pi_state in * futex_proxy_trylock_atomic(). We need to drop it here again. */ put_pi_state(pi_state); out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); futex_hb_waiters_dec(hb2); return ret ? ret : task_count; } /** * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex * @hb: the hash_bucket futex_q was original enqueued on * @q: the futex_q woken while waiting to be requeued * @timeout: the timeout associated with the wait (NULL if none) * * Determine the cause for the early wakeup. * * Return: * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR */ static inline int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, struct futex_q *q, struct hrtimer_sleeper *timeout) { int ret; /* * With the hb lock held, we avoid races while we process the wakeup. 
* We only need to hold hb (and not hb2) to ensure atomicity as the * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. * It can't be requeued from uaddr2 to something else since we don't * support a PI aware source futex for requeue. */ WARN_ON_ONCE(&hb->lock != q->lock_ptr); /* * We were woken prior to requeue by a timeout or a signal. * Unqueue the futex_q and determine which it was. */ plist_del(&q->list, &hb->chain); futex_hb_waiters_dec(hb); /* Handle spurious wakeups gracefully */ ret = -EWOULDBLOCK; if (timeout && !timeout->task) ret = -ETIMEDOUT; else if (signal_pending(current)) ret = -ERESTARTNOINTR; return ret; } /** * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 * @uaddr: the futex we initially wait on (non-pi) * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be * the same type, no requeueing from private to shared, etc. * @val: the expected value of uaddr * @abs_time: absolute timeout * @bitset: 32 bit wakeup bitset set by userspace, defaults to all * @uaddr2: the pi futex we will take prior to returning to user-space * * The caller will wait on uaddr and will be requeued by futex_requeue() to * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to * userspace. This ensures the rt_mutex maintains an owner when it has waiters; * without one, the pi logic would not know which task to boost/deboost, if * there was a need to. * * We call schedule in futex_wait_queue() when we enqueue and return there * via the following-- * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() * 2) wakeup on uaddr2 after a requeue * 3) signal * 4) timeout * * If 3, cleanup and return -ERESTARTNOINTR. * * If 2, we may then block on trying to take the rt_mutex and return via: * 5) successful lock * 6) signal * 7) timeout * 8) other lock acquisition failure * * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). * * If 4 or 7, we cleanup and return with -ETIMEDOUT. * * Return: * - 0 - On success; * - <0 - On error */ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, u32 __user *uaddr2) { struct hrtimer_sleeper timeout, *to; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; struct rt_mutex_base *pi_mutex; int res, ret; if (!IS_ENABLED(CONFIG_FUTEX_PI)) return -ENOSYS; if (uaddr == uaddr2) return -EINVAL; if (!bitset) return -EINVAL; to = futex_setup_timer(abs_time, &timeout, flags, current->timer_slack_ns); /* * The waiter is allocated on our stack, manipulated by the requeue * code while we sleep on uaddr. */ rt_mutex_init_waiter(&rt_waiter); ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) goto out; q.bitset = bitset; q.rt_waiter = &rt_waiter; q.requeue_pi_key = &key2; /* * Prepare to wait on uaddr. On success, it holds hb->lock and q * is initialized. */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) goto out; /* * The check above which compares uaddrs is not sufficient for * shared futexes. We need to compare the keys: */ if (futex_match(&q.key, &key2)) { futex_q_unlock(hb); ret = -EINVAL; goto out; } /* Queue the futex_q, drop the hb lock, wait for wakeup. 
*/ futex_wait_queue(hb, &q, to); switch (futex_requeue_pi_wakeup_sync(&q)) { case Q_REQUEUE_PI_IGNORE: /* The waiter is still on uaddr1 */ spin_lock(&hb->lock); ret = handle_early_requeue_pi_wakeup(hb, &q, to); spin_unlock(&hb->lock); break; case Q_REQUEUE_PI_LOCKED: /* The requeue acquired the lock */ if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_pi_owner(uaddr2, &q, true); /* * Drop the reference to the pi state which the * requeue_pi() code acquired for us. */ put_pi_state(q.pi_state); spin_unlock(q.lock_ptr); /* * Adjust the return value. It's either -EFAULT or * success (1) but the caller expects 0 for success. */ ret = ret < 0 ? ret : 0; } break; case Q_REQUEUE_PI_DONE: /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */ pi_mutex = &q.pi_state->pi_mutex; ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); /* * See futex_unlock_pi()'s cleanup: comment. */ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) ret = 0; spin_lock(q.lock_ptr); debug_rt_mutex_free_waiter(&rt_waiter); /* * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. */ res = fixup_pi_owner(uaddr2, &q, !ret); /* * If fixup_pi_owner() returned an error, propagate that. If it * acquired the lock, clear -ETIMEDOUT or -EINTR. */ if (res) ret = (res < 0) ? res : 0; futex_unqueue_pi(&q); spin_unlock(q.lock_ptr); if (ret == -EINTR) { /* * We've already been requeued, but cannot restart * by calling futex_lock_pi() directly. We could * restart this syscall, but it would detect that * the user space "val" changed and return * -EWOULDBLOCK. Save the overhead of the restart * and return -EWOULDBLOCK directly. */ ret = -EWOULDBLOCK; } break; default: BUG(); } out: if (to) { hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); } return ret; }
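For orientation, here is a hedged userspace sketch of how the two operations driving the code above pair up. This is not glibc's pthread_cond implementation (which additionally manages waiter sequence counters and the FUTEX_WAITERS bit); the variable names and wrapper are invented, and only the futex ops and their argument layout come from the uapi. A waiter parks on the non-PI word with FUTEX_WAIT_REQUEUE_PI while naming the PI futex it expects to be moved to, and the signaler uses FUTEX_CMP_REQUEUE_PI with nr_wake == 1, matching the futex_wait_requeue_pi()/futex_requeue() pairing enforced above.

#include <linux/futex.h>
#include <limits.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Hypothetical futex words: a plain wait word (uaddr1) and a PI-managed
 * lock word (uaddr2) that would be taken with FUTEX_LOCK_PI/UNLOCK_PI. */
static uint32_t cond_word;
static uint32_t pi_lock_word;

/* Raw futex(2) wrapper; glibc does not provide one for this syscall. */
static long sys_futex(uint32_t *uaddr, int op, uint32_t val,
		      void *timeout_or_val2, uint32_t *uaddr2, uint32_t val3)
{
	return syscall(SYS_futex, uaddr, op, val, timeout_or_val2, uaddr2, val3);
}

/* Waiter: sleep on cond_word (expected to still hold @expected) and name
 * pi_lock_word as the futex we will be requeued to. Returns 0 once the PI
 * futex has been acquired on our behalf, or an error on signal/timeout. */
static long example_cond_wait(uint32_t expected)
{
	return sys_futex(&cond_word,
			 FUTEX_WAIT_REQUEUE_PI | FUTEX_PRIVATE_FLAG,
			 expected, NULL /* no timeout */, &pi_lock_word, 0);
}

/* Signaler: wake at most one waiter (nr_wake must be 1 for requeue_pi) and
 * requeue the rest onto the PI futex. @expected is the cmpval checked
 * against cond_word under the hash bucket locks. */
static long example_cond_broadcast(uint32_t expected)
{
	return sys_futex(&cond_word,
			 FUTEX_CMP_REQUEUE_PI | FUTEX_PRIVATE_FLAG,
			 1 /* nr_wake */,
			 (void *)(unsigned long)INT_MAX /* nr_requeue */,
			 &pi_lock_word, expected);
}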
/* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_XARRAY_H #define _LINUX_XARRAY_H /* * eXtensible Arrays * Copyright (c) 2017 Microsoft Corporation * Author: Matthew Wilcox <willy@infradead.org> * * See Documentation/core-api/xarray.rst for how to use the XArray. */ #include <linux/bitmap.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/gfp.h> #include <linux/kconfig.h> #include <linux/kernel.h> #include <linux/rcupdate.h> #include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/types.h> /* * The bottom two bits of the entry determine how the XArray interprets * the contents: * * 00: Pointer entry * 10: Internal entry * x1: Value entry or tagged pointer * * Attempting to store internal entries in the XArray is a bug. * * Most internal entries are pointers to the next node in the tree. * The following internal entries have a special meaning: * * 0-62: Sibling entries * 256: Retry entry * 257: Zero entry * * Errors are also represented as internal entries, but use the negative * space (-4094 to -2). They're never stored in the slots array; only * returned by the normal API. */ #define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) /** * xa_mk_value() - Create an XArray entry from an integer. * @v: Value to store in XArray. * * Context: Any context. * Return: An entry suitable for storing in the XArray. */ static inline void *xa_mk_value(unsigned long v) { WARN_ON((long)v < 0); return (void *)((v << 1) | 1); } /** * xa_to_value() - Get value stored in an XArray entry. * @entry: XArray entry. * * Context: Any context. * Return: The value stored in the XArray entry. */ static inline unsigned long xa_to_value(const void *entry) { return (unsigned long)entry >> 1; } /** * xa_is_value() - Determine if an entry is a value. * @entry: XArray entry. * * Context: Any context. * Return: True if the entry is a value, false if it is a pointer. */ static inline bool xa_is_value(const void *entry) { return (unsigned long)entry & 1; } /** * xa_tag_pointer() - Create an XArray entry for a tagged pointer. * @p: Plain pointer. * @tag: Tag value (0, 1 or 3).
* * If the user of the XArray prefers, they can tag their pointers instead * of storing value entries. Three tags are available (0, 1 and 3). * These are distinct from the xa_mark_t as they are not replicated up * through the array and cannot be searched for. * * Context: Any context. * Return: An XArray entry. */ static inline void *xa_tag_pointer(void *p, unsigned long tag) { return (void *)((unsigned long)p | tag); } /** * xa_untag_pointer() - Turn an XArray entry into a plain pointer. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the untagged version of the pointer. * * Context: Any context. * Return: A pointer. */ static inline void *xa_untag_pointer(void *entry) { return (void *)((unsigned long)entry & ~3UL); } /** * xa_pointer_tag() - Get the tag stored in an XArray entry. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the tag of that pointer. * * Context: Any context. * Return: A tag. */ static inline unsigned int xa_pointer_tag(void *entry) { return (unsigned long)entry & 3UL; } /* * xa_mk_internal() - Create an internal entry. * @v: Value to turn into an internal entry. * * Internal entries are used for a number of purposes. Entries 0-255 are * used for sibling entries (only 0-62 are used by the current code). 256 * is used for the retry entry. 257 is used for the reserved / zero entry. * Negative internal entries are used to represent errnos. Node pointers * are also tagged as internal entries in some situations. * * Context: Any context. * Return: An XArray internal entry corresponding to this value. */ static inline void *xa_mk_internal(unsigned long v) { return (void *)((v << 2) | 2); } /* * xa_to_internal() - Extract the value from an internal entry. * @entry: XArray entry. * * Context: Any context. * Return: The value which was stored in the internal entry. */ static inline unsigned long xa_to_internal(const void *entry) { return (unsigned long)entry >> 2; } /* * xa_is_internal() - Is the entry an internal entry? * @entry: XArray entry. * * Context: Any context. * Return: %true if the entry is an internal entry. */ static inline bool xa_is_internal(const void *entry) { return ((unsigned long)entry & 3) == 2; } #define XA_ZERO_ENTRY xa_mk_internal(257) /** * xa_is_zero() - Is the entry a zero entry? * @entry: Entry retrieved from the XArray * * The normal API will return NULL as the contents of a slot containing * a zero entry. You can only see zero entries by using the advanced API. * * Return: %true if the entry is a zero entry. */ static inline bool xa_is_zero(const void *entry) { return unlikely(entry == XA_ZERO_ENTRY); } /** * xa_is_err() - Report whether an XArray operation returned an error * @entry: Result from calling an XArray function * * If an XArray operation cannot complete an operation, it will return * a special value indicating an error. This function tells you * whether an error occurred; xa_err() tells you which error occurred. * * Context: Any context. * Return: %true if the entry indicates an error. */ static inline bool xa_is_err(const void *entry) { return unlikely(xa_is_internal(entry) && entry >= xa_mk_internal(-MAX_ERRNO)); } /** * xa_err() - Turn an XArray result into an errno. * @entry: Result from calling an XArray function. * * If an XArray operation cannot complete an operation, it will return * a special pointer value which encodes an errno. 
This function extracts * the errno from the pointer value, or returns 0 if the pointer does not * represent an errno. * * Context: Any context. * Return: A negative errno or 0. */ static inline int xa_err(void *entry) { /* xa_to_internal() would not do sign extension. */ if (xa_is_err(entry)) return (long)entry >> 2; return 0; } /** * struct xa_limit - Represents a range of IDs. * @min: The lowest ID to allocate (inclusive). * @max: The maximum ID to allocate (inclusive). * * This structure is used either directly or via the XA_LIMIT() macro * to communicate the range of IDs that are valid for allocation. * Three common ranges are predefined for you: * * xa_limit_32b - [0 - UINT_MAX] * * xa_limit_31b - [0 - INT_MAX] * * xa_limit_16b - [0 - USHRT_MAX] */ struct xa_limit { u32 max; u32 min; }; #define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max } #define xa_limit_32b XA_LIMIT(0, UINT_MAX) #define xa_limit_31b XA_LIMIT(0, INT_MAX) #define xa_limit_16b XA_LIMIT(0, USHRT_MAX) typedef unsigned __bitwise xa_mark_t; #define XA_MARK_0 ((__force xa_mark_t)0U) #define XA_MARK_1 ((__force xa_mark_t)1U) #define XA_MARK_2 ((__force xa_mark_t)2U) #define XA_PRESENT ((__force xa_mark_t)8U) #define XA_MARK_MAX XA_MARK_2 #define XA_FREE_MARK XA_MARK_0 enum xa_lock_type { XA_LOCK_IRQ = 1, XA_LOCK_BH = 2, }; /* * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, * and we remain compatible with that. */ #define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ) #define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH) #define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U) #define XA_FLAGS_ZERO_BUSY ((__force gfp_t)8U) #define XA_FLAGS_ALLOC_WRAPPED ((__force gfp_t)16U) #define XA_FLAGS_ACCOUNT ((__force gfp_t)32U) #define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ (__force unsigned)(mark))) /* ALLOC is for a normal 0-based alloc. ALLOC1 is for an 1-based alloc */ #define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK)) #define XA_FLAGS_ALLOC1 (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY) /** * struct xarray - The anchor of the XArray. * @xa_lock: Lock that protects the contents of the XArray. * * To use the xarray, define it statically or embed it in your data structure. * It is a very small data structure, so it does not usually make sense to * allocate it separately and keep a pointer to it in your data structure. * * You may use the xa_lock to protect your own data structures as well. */ /* * If all of the entries in the array are NULL, @xa_head is a NULL pointer. * If the only non-NULL entry in the array is at index 0, @xa_head is that * entry. If any other entry in the array is non-NULL, @xa_head points * to an @xa_node. */ struct xarray { spinlock_t xa_lock; /* private: The rest of the data structure is not to be used directly. */ gfp_t xa_flags; void __rcu * xa_head; }; #define XARRAY_INIT(name, flags) { \ .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ .xa_flags = flags, \ .xa_head = NULL, \ } /** * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags. * @name: A string that names your XArray. * @flags: XA_FLAG values. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name and flags. It is * equivalent to calling xa_init_flags() on the array, but it does the * initialisation at compiletime instead of runtime. */ #define DEFINE_XARRAY_FLAGS(name, flags) \ struct xarray name = XARRAY_INIT(name, flags) /** * DEFINE_XARRAY() - Define an XArray. 
* @name: A string that names your XArray. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name. It is equivalent * to calling xa_init() on the array, but it does the initialisation at * compiletime instead of runtime. */ #define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0) /** * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC) /** * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1) void *xa_load(struct xarray *, unsigned long index); void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *xa_erase(struct xarray *, unsigned long index); void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); void *xa_find(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); void *xa_find_after(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); unsigned int xa_extract(struct xarray *, void **dst, unsigned long start, unsigned long max, unsigned int n, xa_mark_t); void xa_destroy(struct xarray *); /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. * @flags: XA_FLAG values. * * If you need to initialise an XArray with special flags (eg you need * to take the lock from interrupt context), use this function instead * of xa_init(). * * Context: Any context. */ static inline void xa_init_flags(struct xarray *xa, gfp_t flags) { spin_lock_init(&xa->xa_lock); xa->xa_flags = flags; xa->xa_head = NULL; } /** * xa_init() - Initialise an empty XArray. * @xa: XArray. * * An empty XArray is full of NULL entries. * * Context: Any context. */ static inline void xa_init(struct xarray *xa) { xa_init_flags(xa, 0); } /** * xa_empty() - Determine if an array has any present entries. * @xa: XArray. * * Context: Any context. * Return: %true if the array contains only NULL pointers. */ static inline bool xa_empty(const struct xarray *xa) { return xa->xa_head == NULL; } /** * xa_marked() - Inquire whether any entry in this array has a mark set * @xa: Array * @mark: Mark value * * Context: Any context. * Return: %true if any entry has this mark set. */ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) { return xa->xa_flags & XA_FLAGS_MARK(mark); } /** * xa_for_each_range() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * @last: Last index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. 
At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_range() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_range(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_range(xa, index, entry, start, last) \ for (index = start, \ entry = xa_find(xa, &index, last, XA_PRESENT); \ entry; \ entry = xa_find_after(xa, &index, last, XA_PRESENT)) /** * xa_for_each_start() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_start() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_start(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_start(xa, index, entry, start) \ xa_for_each_range(xa, index, entry, start, ULONG_MAX) /** * xa_for_each() - Iterate over present entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you want * to skip or reprocess indices. It is safe to modify the array during the * iteration. At the end of the iteration, @entry will be set to NULL and * @index will have a value less than or equal to max. * * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). xa_for_each() * will spin if it hits a retry entry; if you intend to see retry entries, * you should use the xas_for_each() iterator instead. The xas_for_each() * iterator will expand into more inline code than xa_for_each(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each(xa, index, entry) \ xa_for_each_start(xa, index, entry, 0) /** * xa_for_each_marked() - Iterate over marked entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @filter: Selection criterion. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. The iteration will skip all entries in the array * which do not match @filter. You may modify @index during the iteration * if you want to skip or reprocess indices. 
It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set to * NULL and @index will have a value less than or equal to max. * * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n). * You have to handle your own locking with xas_for_each(), and if you have * to unlock after each iteration, it will also end up being O(n.log(n)). * xa_for_each_marked() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each_marked() iterator * instead. The xas_for_each_marked() iterator will expand into more inline * code than xa_for_each_marked(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \ entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter)) #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) #define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock) #define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock) #define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock) #define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock) #define xa_lock_irqsave(xa, flags) \ spin_lock_irqsave(&(xa)->xa_lock, flags) #define xa_unlock_irqrestore(xa, flags) \ spin_unlock_irqrestore(&(xa)->xa_lock, flags) #define xa_lock_nested(xa, subclass) \ spin_lock_nested(&(xa)->xa_lock, subclass) #define xa_lock_bh_nested(xa, subclass) \ spin_lock_bh_nested(&(xa)->xa_lock, subclass) #define xa_lock_irq_nested(xa, subclass) \ spin_lock_irq_nested(&(xa)->xa_lock, subclass) #define xa_lock_irqsave_nested(xa, flags, subclass) \ spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass) /* * Versions of the normal API which require the caller to hold the * xa_lock. If the GFP flags allow it, they will drop the lock to * allocate memory, then reacquire it afterwards. These functions * may also re-enable interrupts if the XArray flags indicate the * locking should be interrupt safe. */ void *__xa_erase(struct xarray *, unsigned long index); void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old, void *entry, gfp_t); int __must_check __xa_insert(struct xarray *, unsigned long index, void *entry, gfp_t); int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry, struct xa_limit, gfp_t); int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry, struct xa_limit, u32 *next, gfp_t); void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); /** * xa_store_bh() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_store_irq() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. 
* * This function is like calling xa_store() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_erase_bh() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The entry which used to be at this index. */ static inline void *xa_erase_bh(struct xarray *xa, unsigned long index) { void *entry; xa_lock_bh(xa); entry = __xa_erase(xa, index); xa_unlock_bh(xa); return entry; } /** * xa_erase_irq() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The entry which used to be at this index. */ static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) { void *entry; xa_lock_irq(xa); entry = __xa_erase(xa, index); xa_unlock_irq(xa); return entry; } /** * xa_cmpxchg() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * If the entry at @index is the same as @old, replace it with @entry. * If the return value is equal to @old, then the exchange was successful. * * Context: Any context. Takes and releases the xa_lock. May sleep * if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock(xa); return curr; } /** * xa_cmpxchg_bh() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_cmpxchg_irq() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. 
* * This function is like calling xa_cmpxchg() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_insert() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock(xa); return err; } /** * xa_insert_bh() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_bh(xa); return err; } /** * xa_insert_irq() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. 
* * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline __must_check int xa_alloc(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock(xa); return err; } /** * xa_alloc_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. 
*/ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock(xa); return err; } /** * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_irq(xa); return err; } /** * xa_reserve() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * Ensures there is somewhere to store an entry at @index in the array. * If there is already something stored at @index, this function does * nothing. If there was nothing there, the entry is marked as reserved. * Loading from a reserved entry returns a %NULL pointer. * * If you do not use the entry that you have reserved, call xa_release() * or xa_erase() to free any unnecessary memory. * * Context: Any context. Takes and releases the xa_lock. * May sleep if the @gfp flags permit. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. 
*/ static inline __must_check int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_bh() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * A softirq-disabling version of xa_reserve(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_irq() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * An interrupt-disabling version of xa_reserve(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_release() - Release a reserved entry. * @xa: XArray. * @index: Index of entry. * * After calling xa_reserve(), you can call this function to release the * reservation. If the entry at @index has been stored to, this function * will do nothing. */ static inline void xa_release(struct xarray *xa, unsigned long index) { xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0); } /* Everything below here is the Advanced API. Proceed with caution. */ /* * The xarray is constructed out of a set of 'chunks' of pointers. Choosing * the best chunk size requires some tradeoffs. A power of two recommends * itself so that we can walk the tree based purely on shifts and masks. * Generally, the larger the better; as the number of slots per level of the * tree increases, the less tall the tree needs to be. But that needs to be * balanced against the memory consumption of each node. On a 64-bit system, * xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we * doubled the number of slots per node, we'd get only 3 nodes per 4kB page. */ #ifndef XA_CHUNK_SHIFT #define XA_CHUNK_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) #endif #define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT) #define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1) #define XA_MAX_MARKS 3 #define XA_MARK_LONGS DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG) /* * @count is the count of every non-NULL element in the ->slots array * whether that is a value entry, a retry entry, a user pointer, * a sibling entry or a pointer to the next level of the tree. * @nr_values is the count of every element in ->slots which is * either a value entry or a sibling of a value entry. 
*/ struct xa_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ unsigned char count; /* Total entry count */ unsigned char nr_values; /* Value entry count */ struct xa_node __rcu *parent; /* NULL at top of tree */ struct xarray *array; /* The array we belong to */ union { struct list_head private_list; /* For tree user */ struct rcu_head rcu_head; /* Used when freeing node */ }; void __rcu *slots[XA_CHUNK_SIZE]; union { unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS]; unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS]; }; }; void xa_dump(const struct xarray *); void xa_dump_node(const struct xa_node *); #ifdef XA_DEBUG #define XA_BUG_ON(xa, x) do { \ if (x) { \ xa_dump(xa); \ BUG(); \ } \ } while (0) #define XA_NODE_BUG_ON(node, x) do { \ if (x) { \ if (node) xa_dump_node(node); \ BUG(); \ } \ } while (0) #else #define XA_BUG_ON(xa, x) do { } while (0) #define XA_NODE_BUG_ON(node, x) do { } while (0) #endif /* Private */ static inline void *xa_head(const struct xarray *xa) { return rcu_dereference_check(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_head_locked(const struct xarray *xa) { return rcu_dereference_protected(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_check(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry_locked(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_protected(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_check(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent_locked(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_protected(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_mk_node(const struct xa_node *node) { return (void *)((unsigned long)node | 2); } /* Private */ static inline struct xa_node *xa_to_node(const void *entry) { return (struct xa_node *)((unsigned long)entry - 2); } /* Private */ static inline bool xa_is_node(const void *entry) { return xa_is_internal(entry) && (unsigned long)entry > 4096; } /* Private */ static inline void *xa_mk_sibling(unsigned int offset) { return xa_mk_internal(offset); } /* Private */ static inline unsigned long xa_to_sibling(const void *entry) { return xa_to_internal(entry); } /** * xa_is_sibling() - Is the entry a sibling entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a sibling entry. */ static inline bool xa_is_sibling(const void *entry) { return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) && (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1)); } #define XA_RETRY_ENTRY xa_mk_internal(256) /** * xa_is_retry() - Is the entry a retry entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a retry entry. */ static inline bool xa_is_retry(const void *entry) { return unlikely(entry == XA_RETRY_ENTRY); } /** * xa_is_advanced() - Is the entry only permitted for the advanced API? * @entry: Entry to be stored in the XArray. * * Return: %true if the entry cannot be stored by the normal API. 
*/ static inline bool xa_is_advanced(const void *entry) { return xa_is_internal(entry) && (entry <= XA_RETRY_ENTRY); } /** * typedef xa_update_node_t - A callback function from the XArray. * @node: The node which is being processed * * This function is called every time the XArray updates the count of * present and value entries in a node. It allows advanced users to * maintain the private_list in the node. * * Context: The xa_lock is held and interrupts may be disabled. * Implementations should not drop the xa_lock, nor re-enable * interrupts. */ typedef void (*xa_update_node_t)(struct xa_node *node); void xa_delete_node(struct xa_node *, xa_update_node_t); /* * The xa_state is opaque to its users. It contains various different pieces * of state involved in the current operation on the XArray. It should be * declared on the stack and passed between the various internal routines. * The various elements in it should not be accessed directly, but only * through the provided accessor functions. The below documentation is for * the benefit of those working on the code, not for users of the XArray. * * @xa_node usually points to the xa_node containing the slot we're operating * on (and @xa_offset is the offset in the slots array). If there is a * single entry in the array at index 0, there are no allocated xa_nodes to * point to, and so we store %NULL in @xa_node. @xa_node is set to * the value %XAS_RESTART if the xa_state is not walked to the correct * position in the tree of nodes for this operation. If an error occurs * during an operation, it is set to an %XAS_ERROR value. If we run off the * end of the allocated nodes, it is set to %XAS_BOUNDS. */ struct xa_state { struct xarray *xa; unsigned long xa_index; unsigned char xa_shift; unsigned char xa_sibs; unsigned char xa_offset; unsigned char xa_pad; /* Helps gcc generate better code */ struct xa_node *xa_node; struct xa_node *xa_alloc; xa_update_node_t xa_update; struct list_lru *xa_lru; }; /* * We encode errnos in the xas->xa_node. If an error has happened, we need to * drop the lock to fix it, and once we've done so the xa_state is invalid. */ #define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL)) #define XAS_BOUNDS ((struct xa_node *)1UL) #define XAS_RESTART ((struct xa_node *)3UL) #define __XA_STATE(array, index, shift, sibs) { \ .xa = array, \ .xa_index = index, \ .xa_shift = shift, \ .xa_sibs = sibs, \ .xa_offset = 0, \ .xa_pad = 0, \ .xa_node = XAS_RESTART, \ .xa_alloc = NULL, \ .xa_update = NULL, \ .xa_lru = NULL, \ } /** * XA_STATE() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * * Declare and initialise an xa_state on the stack. */ #define XA_STATE(name, array, index) \ struct xa_state name = __XA_STATE(array, index, 0, 0) /** * XA_STATE_ORDER() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * @order: Order of entry. * * Declare and initialise an xa_state on the stack. 
This variant of * XA_STATE() allows you to specify the 'order' of the element you * want to operate on.` */ #define XA_STATE_ORDER(name, array, index, order) \ struct xa_state name = __XA_STATE(array, \ (index >> order) << order, \ order - (order % XA_CHUNK_SHIFT), \ (1U << (order % XA_CHUNK_SHIFT)) - 1) #define xas_marked(xas, mark) xa_marked((xas)->xa, (mark)) #define xas_trylock(xas) xa_trylock((xas)->xa) #define xas_lock(xas) xa_lock((xas)->xa) #define xas_unlock(xas) xa_unlock((xas)->xa) #define xas_lock_bh(xas) xa_lock_bh((xas)->xa) #define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa) #define xas_lock_irq(xas) xa_lock_irq((xas)->xa) #define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa) #define xas_lock_irqsave(xas, flags) \ xa_lock_irqsave((xas)->xa, flags) #define xas_unlock_irqrestore(xas, flags) \ xa_unlock_irqrestore((xas)->xa, flags) /** * xas_error() - Return an errno stored in the xa_state. * @xas: XArray operation state. * * Return: 0 if no error has been noted. A negative errno if one has. */ static inline int xas_error(const struct xa_state *xas) { return xa_err(xas->xa_node); } /** * xas_set_err() - Note an error in the xa_state. * @xas: XArray operation state. * @err: Negative error number. * * Only call this function with a negative @err; zero or positive errors * will probably not behave the way you think they should. If you want * to clear the error from an xa_state, use xas_reset(). */ static inline void xas_set_err(struct xa_state *xas, long err) { xas->xa_node = XA_ERROR(err); } /** * xas_invalid() - Is the xas in a retry or error state? * @xas: XArray operation state. * * Return: %true if the xas cannot be used for operations. */ static inline bool xas_invalid(const struct xa_state *xas) { return (unsigned long)xas->xa_node & 3; } /** * xas_valid() - Is the xas a valid cursor into the array? * @xas: XArray operation state. * * Return: %true if the xas can be used for operations. */ static inline bool xas_valid(const struct xa_state *xas) { return !xas_invalid(xas); } /** * xas_is_node() - Does the xas point to a node? * @xas: XArray operation state. * * Return: %true if the xas currently references a node. */ static inline bool xas_is_node(const struct xa_state *xas) { return xas_valid(xas) && xas->xa_node; } /* True if the pointer is something other than a node */ static inline bool xas_not_node(struct xa_node *node) { return ((unsigned long)node & 3) || !node; } /* True if the node represents RESTART or an error */ static inline bool xas_frozen(struct xa_node *node) { return (unsigned long)node & 2; } /* True if the node represents head-of-tree, RESTART or BOUNDS */ static inline bool xas_top(struct xa_node *node) { return node <= XAS_RESTART; } /** * xas_reset() - Reset an XArray operation state. * @xas: XArray operation state. * * Resets the error or walk state of the @xas so future walks of the * array will start from the root. Use this if you have dropped the * xarray lock and want to reuse the xa_state. * * Context: Any context. */ static inline void xas_reset(struct xa_state *xas) { xas->xa_node = XAS_RESTART; } /** * xas_retry() - Retry the operation if appropriate. * @xas: XArray operation state. * @entry: Entry from xarray. * * The advanced functions may sometimes return an internal entry, such as * a retry entry or a zero entry. This function sets up the @xas to restart * the walk from the head of the array if needed. * * Context: Any context. * Return: true if the operation needs to be retried. 
*/ static inline bool xas_retry(struct xa_state *xas, const void *entry) { if (xa_is_zero(entry)) return true; if (!xa_is_retry(entry)) return false; xas_reset(xas); return true; } void *xas_load(struct xa_state *); void *xas_store(struct xa_state *, void *entry); void *xas_find(struct xa_state *, unsigned long max); void *xas_find_conflict(struct xa_state *); bool xas_get_mark(const struct xa_state *, xa_mark_t); void xas_set_mark(const struct xa_state *, xa_mark_t); void xas_clear_mark(const struct xa_state *, xa_mark_t); void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t); void xas_init_marks(const struct xa_state *); bool xas_nomem(struct xa_state *, gfp_t); void xas_destroy(struct xa_state *); void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); #ifdef CONFIG_XARRAY_MULTI int xa_get_order(struct xarray *, unsigned long index); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { return 0; } static inline void xas_split(struct xa_state *xas, void *entry, unsigned int order) { xas_store(xas, entry); } static inline void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { } #endif /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. * * Use this function to check that a previously loaded entry still has * the same value. This is useful for the lockless pagecache lookup where * we walk the array with only the RCU lock to protect us, lock the page, * then check that the page hasn't moved since we looked it up. * * The caller guarantees that @xas is still valid. If it may be in an * error or restart state, call xas_load() instead. * * Return: The entry at this location in the xarray. */ static inline void *xas_reload(struct xa_state *xas) { struct xa_node *node = xas->xa_node; void *entry; char offset; if (!node) return xa_head(xas->xa); if (IS_ENABLED(CONFIG_XARRAY_MULTI)) { offset = (xas->xa_index >> node->shift) & XA_CHUNK_MASK; entry = xa_entry(xas->xa, node, offset); if (!xa_is_sibling(entry)) return entry; offset = xa_to_sibling(entry); } else { offset = xas->xa_offset; } return xa_entry(xas->xa, node, offset); } /** * xas_set() - Set up XArray operation state for a different index. * @xas: XArray operation state. * @index: New index into the XArray. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see xas_next() * to move to an adjacent index. */ static inline void xas_set(struct xa_state *xas, unsigned long index) { xas->xa_index = index; xas->xa_node = XAS_RESTART; } /** * xas_advance() - Skip over sibling entries. * @xas: XArray operation state. * @index: Index of last sibling entry. * * Move the operation state to refer to the last sibling entry. * This is useful for loops that normally want to see sibling * entries but sometimes want to skip them. Use xas_set() if you * want to move to an index which is not part of this entry. */ static inline void xas_advance(struct xa_state *xas, unsigned long index) { unsigned char shift = xas_is_node(xas) ? xas->xa_node->shift : 0; xas->xa_index = index; xas->xa_offset = (index >> shift) & XA_CHUNK_MASK; } /** * xas_set_order() - Set up XArray operation state for a multislot entry. * @xas: XArray operation state. * @index: Target of the operation. * @order: Entry occupies 2^@order indices. 
*/ static inline void xas_set_order(struct xa_state *xas, unsigned long index, unsigned int order) { #ifdef CONFIG_XARRAY_MULTI xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0; xas->xa_shift = order - (order % XA_CHUNK_SHIFT); xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; xas->xa_node = XAS_RESTART; #else BUG_ON(order > 0); xas_set(xas, index); #endif } /** * xas_set_update() - Set up XArray operation state for a callback. * @xas: XArray operation state. * @update: Function to call when updating a node. * * The XArray can notify a caller after it has updated an xa_node. * This is advanced functionality and is only needed by the page * cache and swap cache. */ static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) { xas->xa_update = update; } static inline void xas_set_lru(struct xa_state *xas, struct list_lru *lru) { xas->xa_lru = lru; } /** * xas_next_entry() - Advance iterator to next present entry. * @xas: XArray operation state. * @max: Highest index to return. * * xas_next_entry() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find(), and will call xas_find() * for all the hard cases. * * Return: The next present entry after the one currently referred to by @xas. */ static inline void *xas_next_entry(struct xa_state *xas, unsigned long max) { struct xa_node *node = xas->xa_node; void *entry; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK))) return xas_find(xas, max); do { if (unlikely(xas->xa_index >= max)) return xas_find(xas, max); if (unlikely(xas->xa_offset == XA_CHUNK_MASK)) return xas_find(xas, max); entry = xa_entry(xas->xa, node, xas->xa_offset + 1); if (unlikely(xa_is_internal(entry))) return xas_find(xas, max); xas->xa_offset++; xas->xa_index++; } while (!entry); return entry; } /* Private */ static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance, xa_mark_t mark) { unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark]; unsigned int offset = xas->xa_offset; if (advance) offset++; if (XA_CHUNK_SIZE == BITS_PER_LONG) { if (offset < XA_CHUNK_SIZE) { unsigned long data = *addr & (~0UL << offset); if (data) return __ffs(data); } return XA_CHUNK_SIZE; } return find_next_bit(addr, XA_CHUNK_SIZE, offset); } /** * xas_next_marked() - Advance iterator to next marked entry. * @xas: XArray operation state. * @max: Highest index to return. * @mark: Mark to search for. * * xas_next_marked() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find_marked(), and will call * xas_find_marked() for all the hard cases. * * Return: The next marked entry after the one currently referred to by @xas. */ static inline void *xas_next_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) { struct xa_node *node = xas->xa_node; void *entry; unsigned int offset; if (unlikely(xas_not_node(node) || node->shift)) return xas_find_marked(xas, max, mark); offset = xas_find_chunk(xas, true, mark); xas->xa_offset = offset; xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset; if (xas->xa_index > max) return NULL; if (offset == XA_CHUNK_SIZE) return xas_find_marked(xas, max, mark); entry = xa_entry(xas->xa, node, offset); if (!entry) return xas_find_marked(xas, max, mark); return entry; } /* * If iterating while holding a lock, drop the lock and reschedule * every %XA_CHECK_SCHED loops. 
*/ enum { XA_CHECK_SCHED = 4096, }; /** * xas_for_each() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * * The loop body will be executed for each entry present in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each(xas, entry, max) \ for (entry = xas_find(xas, max); entry; \ entry = xas_next_entry(xas, max)) /** * xas_for_each_marked() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * @mark: Mark to search for. * * The loop body will be executed for each marked entry in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each_marked(xas, entry, max, mark) \ for (entry = xas_find_marked(xas, max, mark); entry; \ entry = xas_next_marked(xas, max, mark)) /** * xas_for_each_conflict() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * * The loop body will be executed for each entry in the XArray that * lies within the range specified by @xas. If the loop terminates * normally, @entry will be %NULL. The user may break out of the loop, * which will leave @entry set to the conflicting entry. The caller * may also call xa_set_err() to exit the loop while setting an error * to record the reason. */ #define xas_for_each_conflict(xas, entry) \ while ((entry = xas_find_conflict(xas))) void *__xas_next(struct xa_state *); void *__xas_prev(struct xa_state *); /** * xas_prev() - Move iterator to previous index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * subtracted from the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index 0, this function wraps * around to %ULONG_MAX. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_prev(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == 0)) return __xas_prev(xas); xas->xa_index--; xas->xa_offset--; return xa_entry(xas->xa, node, xas->xa_offset); } /** * xas_next() - Move state to next index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * added to the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index %ULONG_MAX, this function wraps * around to 0. * * Return: The entry at the new index. This may be %NULL or an internal * entry. 
 */
static inline void *xas_next(struct xa_state *xas)
{
	struct xa_node *node = xas->xa_node;

	if (unlikely(xas_not_node(node) || node->shift ||
		     xas->xa_offset == XA_CHUNK_MASK))
		return __xas_next(xas);

	xas->xa_index++;
	xas->xa_offset++;
	return xa_entry(xas->xa, node, xas->xa_offset);
}

#endif /* _LINUX_XARRAY_H */
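/*
 * Illustrative sketch (not part of the header above): a minimal example of
 * how a caller typically drives the advanced xas_* API declared in this
 * header. The function name tag_range_dirty(), its arguments and the choice
 * of XA_MARK_0 are hypothetical and used only for illustration; the pattern
 * of holding the xa_lock around xas_for_each() and xas_set_mark() follows
 * the locking guidance in the iteration documentation above.
 */
static void tag_range_dirty(struct xarray *xa, unsigned long first,
			    unsigned long last)
{
	XA_STATE(xas, xa, first);	/* operation state, walk starts at @first */
	void *entry;

	xas_lock(&xas);			/* marks may only be set under the xa_lock */
	xas_for_each(&xas, entry, last) {
		if (xas_retry(&xas, entry))	/* skip retry entries, restart walk */
			continue;
		xas_set_mark(&xas, XA_MARK_0);	/* mark the slot of the current entry */
	}
	xas_unlock(&xas);
}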
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
* Authors: David Chinner and Glauber Costa * * Generic LRU infrastructure */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/list_lru.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/memcontrol.h> #include "slab.h" #include "internal.h" #ifdef CONFIG_MEMCG_KMEM static LIST_HEAD(memcg_list_lrus); static DEFINE_MUTEX(list_lrus_mutex); static inline bool list_lru_memcg_aware(struct list_lru *lru) { return lru->memcg_aware; } static void list_lru_register(struct list_lru *lru) { if (!list_lru_memcg_aware(lru)) return; mutex_lock(&list_lrus_mutex); list_add(&lru->list, &memcg_list_lrus); mutex_unlock(&list_lrus_mutex); } static void list_lru_unregister(struct list_lru *lru) { if (!list_lru_memcg_aware(lru)) return; mutex_lock(&list_lrus_mutex); list_del(&lru->list); mutex_unlock(&list_lrus_mutex); } static int lru_shrinker_id(struct list_lru *lru) { return lru->shrinker_id; } static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { if (list_lru_memcg_aware(lru) && idx >= 0) { struct list_lru_memcg *mlru = xa_load(&lru->xa, idx); return mlru ? &mlru->node[nid] : NULL; } return &lru->node[nid].lru; } #else static void list_lru_register(struct list_lru *lru) { } static void list_lru_unregister(struct list_lru *lru) { } static int lru_shrinker_id(struct list_lru *lru) { return -1; } static inline bool list_lru_memcg_aware(struct list_lru *lru) { return false; } static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { return &lru->node[nid].lru; } #endif /* CONFIG_MEMCG_KMEM */ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; spin_lock(&nlru->lock); if (list_empty(item)) { l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); nlru->nr_items++; spin_unlock(&nlru->lock); return true; } spin_unlock(&nlru->lock); return false; } EXPORT_SYMBOL_GPL(list_lru_add); bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); struct mem_cgroup *memcg = list_lru_memcg_aware(lru) ? mem_cgroup_from_slab_obj(item) : NULL; return list_lru_add(lru, item, nid, memcg); } EXPORT_SYMBOL_GPL(list_lru_add_obj); bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; spin_lock(&nlru->lock); if (!list_empty(item)) { l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); list_del_init(item); l->nr_items--; nlru->nr_items--; spin_unlock(&nlru->lock); return true; } spin_unlock(&nlru->lock); return false; } EXPORT_SYMBOL_GPL(list_lru_del); bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); struct mem_cgroup *memcg = list_lru_memcg_aware(lru) ? 
mem_cgroup_from_slab_obj(item) : NULL; return list_lru_del(lru, item, nid, memcg); } EXPORT_SYMBOL_GPL(list_lru_del_obj); void list_lru_isolate(struct list_lru_one *list, struct list_head *item) { list_del_init(item); list->nr_items--; } EXPORT_SYMBOL_GPL(list_lru_isolate); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head) { list_move(item, head); list->nr_items--; } EXPORT_SYMBOL_GPL(list_lru_isolate_move); void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_one *list = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); if (list_empty(item)) { list_add_tail(item, &list->list); if (!list->nr_items++) set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); } } EXPORT_SYMBOL_GPL(list_lru_putback); unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg) { struct list_lru_one *l; long count; rcu_read_lock(); l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); count = l ? READ_ONCE(l->nr_items) : 0; rcu_read_unlock(); if (unlikely(count < 0)) count = 0; return count; } EXPORT_SYMBOL_GPL(list_lru_count_one); unsigned long list_lru_count_node(struct list_lru *lru, int nid) { struct list_lru_node *nlru; nlru = &lru->node[nid]; return nlru->nr_items; } EXPORT_SYMBOL_GPL(list_lru_count_node); static unsigned long __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; struct list_head *item, *n; unsigned long isolated = 0; restart: l = list_lru_from_memcg_idx(lru, nid, memcg_idx); if (!l) goto out; list_for_each_safe(item, n, &l->list) { enum lru_status ret; /* * decrement nr_to_walk first so that we don't livelock if we * get stuck on large numbers of LRU_RETRY items */ if (!*nr_to_walk) break; --*nr_to_walk; ret = isolate(item, l, &nlru->lock, cb_arg); switch (ret) { case LRU_REMOVED_RETRY: assert_spin_locked(&nlru->lock); fallthrough; case LRU_REMOVED: isolated++; nlru->nr_items--; /* * If the lru lock has been dropped, our list * traversal is now invalid and so we have to * restart from scratch. */ if (ret == LRU_REMOVED_RETRY) goto restart; break; case LRU_ROTATE: list_move_tail(item, &l->list); break; case LRU_SKIP: break; case LRU_RETRY: /* * The lru lock has been dropped, our list traversal is * now invalid and so we have to restart from scratch. 
*/ assert_spin_locked(&nlru->lock); goto restart; default: BUG(); } } out: return isolated; } unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { struct list_lru_node *nlru = &lru->node[nid]; unsigned long ret; spin_lock(&nlru->lock); ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, cb_arg, nr_to_walk); spin_unlock(&nlru->lock); return ret; } EXPORT_SYMBOL_GPL(list_lru_walk_one); unsigned long list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { struct list_lru_node *nlru = &lru->node[nid]; unsigned long ret; spin_lock_irq(&nlru->lock); ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, cb_arg, nr_to_walk); spin_unlock_irq(&nlru->lock); return ret; } unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { long isolated = 0; isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, nr_to_walk); #ifdef CONFIG_MEMCG_KMEM if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { struct list_lru_memcg *mlru; unsigned long index; xa_for_each(&lru->xa, index, mlru) { struct list_lru_node *nlru = &lru->node[nid]; spin_lock(&nlru->lock); isolated += __list_lru_walk_one(lru, nid, index, isolate, cb_arg, nr_to_walk); spin_unlock(&nlru->lock); if (*nr_to_walk <= 0) break; } } #endif return isolated; } EXPORT_SYMBOL_GPL(list_lru_walk_node); static void init_one_lru(struct list_lru_one *l) { INIT_LIST_HEAD(&l->list); l->nr_items = 0; } #ifdef CONFIG_MEMCG_KMEM static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) { int nid; struct list_lru_memcg *mlru; mlru = kmalloc(struct_size(mlru, node, nr_node_ids), gfp); if (!mlru) return NULL; for_each_node(nid) init_one_lru(&mlru->node[nid]); return mlru; } static void memcg_list_lru_free(struct list_lru *lru, int src_idx) { struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx); /* * The __list_lru_walk_one() can walk the list of this node. * We need kvfree_rcu() here. And the walking of the list * is under lru->node[nid]->lock, which can serve as a RCU * read-side critical section. */ if (mlru) kvfree_rcu(mlru, rcu); } static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { if (memcg_aware) xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ); lru->memcg_aware = memcg_aware; } static void memcg_destroy_list_lru(struct list_lru *lru) { XA_STATE(xas, &lru->xa, 0); struct list_lru_memcg *mlru; if (!list_lru_memcg_aware(lru)) return; xas_lock_irq(&xas); xas_for_each(&xas, mlru, ULONG_MAX) { kfree(mlru); xas_store(&xas, NULL); } xas_unlock_irq(&xas); } static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, int src_idx, struct mem_cgroup *dst_memcg) { struct list_lru_node *nlru = &lru->node[nid]; int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *src, *dst; /* * Since list_lru_{add,del} may be called under an IRQ-safe lock, * we have to use IRQ-safe primitives here to avoid deadlock. 
*/ spin_lock_irq(&nlru->lock); src = list_lru_from_memcg_idx(lru, nid, src_idx); if (!src) goto out; dst = list_lru_from_memcg_idx(lru, nid, dst_idx); list_splice_init(&src->list, &dst->list); if (src->nr_items) { dst->nr_items += src->nr_items; set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); src->nr_items = 0; } out: spin_unlock_irq(&nlru->lock); } static void memcg_reparent_list_lru(struct list_lru *lru, int src_idx, struct mem_cgroup *dst_memcg) { int i; for_each_node(i) memcg_reparent_list_lru_node(lru, i, src_idx, dst_memcg); memcg_list_lru_free(lru, src_idx); } void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) { struct cgroup_subsys_state *css; struct list_lru *lru; int src_idx = memcg->kmemcg_id; /* * Change kmemcg_id of this cgroup and all its descendants to the * parent's id, and then move all entries from this cgroup's list_lrus * to ones of the parent. * * After we have finished, all list_lrus corresponding to this cgroup * are guaranteed to remain empty. So we can safely free this cgroup's * list lrus in memcg_list_lru_free(). * * Changing ->kmemcg_id to the parent can prevent memcg_list_lru_alloc() * from allocating list lrus for this cgroup after memcg_list_lru_free() * call. */ rcu_read_lock(); css_for_each_descendant_pre(css, &memcg->css) { struct mem_cgroup *child; child = mem_cgroup_from_css(css); WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id); } rcu_read_unlock(); mutex_lock(&list_lrus_mutex); list_for_each_entry(lru, &memcg_list_lrus, list) memcg_reparent_list_lru(lru, src_idx, parent); mutex_unlock(&list_lrus_mutex); } static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, struct list_lru *lru) { int idx = memcg->kmemcg_id; return idx < 0 || xa_load(&lru->xa, idx); } int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp) { int i; unsigned long flags; struct list_lru_memcg_table { struct list_lru_memcg *mlru; struct mem_cgroup *memcg; } *table; XA_STATE(xas, &lru->xa, 0); if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) return 0; gfp &= GFP_RECLAIM_MASK; table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp); if (!table) return -ENOMEM; /* * Because the list_lru can be reparented to the parent cgroup's * list_lru, we should make sure that this cgroup and all its * ancestors have allocated list_lru_memcg. */ for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) { if (memcg_list_lru_allocated(memcg, lru)) break; table[i].memcg = memcg; table[i].mlru = memcg_init_list_lru_one(gfp); if (!table[i].mlru) { while (i--) kfree(table[i].mlru); kfree(table); return -ENOMEM; } } xas_lock_irqsave(&xas, flags); while (i--) { int index = READ_ONCE(table[i].memcg->kmemcg_id); struct list_lru_memcg *mlru = table[i].mlru; xas_set(&xas, index); retry: if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) { kfree(mlru); } else { xas_store(&xas, mlru); if (xas_error(&xas) == -ENOMEM) { xas_unlock_irqrestore(&xas, flags); if (xas_nomem(&xas, gfp)) xas_set_err(&xas, 0); xas_lock_irqsave(&xas, flags); /* * The xas lock has been released, this memcg * can be reparented before us. So reload * memcg id. More details see the comments * in memcg_reparent_list_lrus(). */ index = READ_ONCE(table[i].memcg->kmemcg_id); if (index < 0) xas_set_err(&xas, 0); else if (!xas_error(&xas) && index != xas.xa_index) xas_set(&xas, index); goto retry; } } } /* xas_nomem() is used to free memory instead of memory allocation. 
*/ if (xas.xa_alloc) xas_nomem(&xas, gfp); xas_unlock_irqrestore(&xas, flags); kfree(table); return xas_error(&xas); } #else static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { } static void memcg_destroy_list_lru(struct list_lru *lru) { } #endif /* CONFIG_MEMCG_KMEM */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key, struct shrinker *shrinker) { int i; #ifdef CONFIG_MEMCG_KMEM if (shrinker) lru->shrinker_id = shrinker->id; else lru->shrinker_id = -1; #endif lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); if (!lru->node) return -ENOMEM; for_each_node(i) { spin_lock_init(&lru->node[i].lock); if (key) lockdep_set_class(&lru->node[i].lock, key); init_one_lru(&lru->node[i].lru); } memcg_init_list_lru(lru, memcg_aware); list_lru_register(lru); return 0; } EXPORT_SYMBOL_GPL(__list_lru_init); void list_lru_destroy(struct list_lru *lru) { /* Already destroyed or not yet initialized? */ if (!lru->node) return; list_lru_unregister(lru); memcg_destroy_list_lru(lru); kfree(lru->node); lru->node = NULL; #ifdef CONFIG_MEMCG_KMEM lru->shrinker_id = -1; #endif } EXPORT_SYMBOL_GPL(list_lru_destroy);
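/*
 * Illustrative sketch (not from list_lru.c above): the shape of a typical
 * list_lru consumer built on the API implemented in this file. The
 * my_object type, my_lru, my_object_done(), my_isolate() and
 * my_shrink_node() names are hypothetical, and the setup comment assumes
 * list_lru_init() is the plain (non memcg-aware) wrapper around
 * __list_lru_init() from <linux/list_lru.h>.
 */
#include <linux/list_lru.h>
#include <linux/slab.h>

struct my_object {
	struct list_head lru;	/* linked into a per-node list_lru_one list */
	/* ... payload ... */
};

static struct list_lru my_lru;	/* set up once, e.g. with list_lru_init(&my_lru) */

/* Called when an object becomes unused and may be reclaimed later. */
static void my_object_done(struct my_object *obj)
{
	list_lru_add_obj(&my_lru, &obj->lru);	/* NUMA node derived from the object */
}

/* Walk callback: runs under the per-node nlru->lock taken by the walker. */
static enum lru_status my_isolate(struct list_head *item,
				  struct list_lru_one *list,
				  spinlock_t *lock, void *cb_arg)
{
	struct list_head *dispose = cb_arg;

	/* Move the item off the LRU onto a private list for later freeing. */
	list_lru_isolate_move(list, item, dispose);
	return LRU_REMOVED;
}

/* Reclaim up to @nr objects from node @nid; returns how many were isolated. */
static unsigned long my_shrink_node(int nid, unsigned long nr)
{
	LIST_HEAD(dispose);
	unsigned long isolated;
	struct my_object *obj, *next;

	isolated = list_lru_walk_node(&my_lru, nid, my_isolate, &dispose, &nr);

	list_for_each_entry_safe(obj, next, &dispose, lru)
		kfree(obj);

	return isolated;
}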
// SPDX-License-Identifier: GPL-2.0 /* * Copyright © 2019 Oracle and/or its affiliates. All rights reserved. * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
* * KVM Xen emulation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "x86.h" #include "xen.h" #include "hyperv.h" #include "lapic.h" #include <linux/eventfd.h> #include <linux/kvm_host.h> #include <linux/sched/stat.h> #include <trace/events/kvm.h> #include <xen/interface/xen.h> #include <xen/interface/vcpu.h> #include <xen/interface/version.h> #include <xen/interface/event_channel.h> #include <xen/interface/sched.h> #include <asm/xen/cpuid.h> #include "cpuid.h" #include "trace.h" static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm); static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data); static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r); DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ); static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) { struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; struct pvclock_wall_clock *wc; gpa_t gpa = gfn_to_gpa(gfn); u32 *wc_sec_hi; u32 wc_version; u64 wall_nsec; int ret = 0; int idx = srcu_read_lock(&kvm->srcu); if (gfn == KVM_XEN_INVALID_GFN) { kvm_gpc_deactivate(gpc); goto out; } do { ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE); if (ret) goto out; /* * This code mirrors kvm_write_wall_clock() except that it writes * directly through the pfn cache and doesn't mark the page dirty. */ wall_nsec = kvm_get_wall_clock_epoch(kvm); /* It could be invalid again already, so we need to check */ read_lock_irq(&gpc->lock); if (gpc->valid) break; read_unlock_irq(&gpc->lock); } while (1); /* Paranoia checks on the 32-bit struct layout */ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924); BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); #ifdef CONFIG_X86_64 /* Paranoia checks on the 64-bit struct layout */ BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00); BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c); if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { struct shared_info *shinfo = gpc->khva; wc_sec_hi = &shinfo->wc_sec_hi; wc = &shinfo->wc; } else #endif { struct compat_shared_info *shinfo = gpc->khva; wc_sec_hi = &shinfo->arch.wc_sec_hi; wc = &shinfo->wc; } /* Increment and ensure an odd value */ wc_version = wc->version = (wc->version + 1) | 1; smp_wmb(); wc->nsec = do_div(wall_nsec, NSEC_PER_SEC); wc->sec = (u32)wall_nsec; *wc_sec_hi = wall_nsec >> 32; smp_wmb(); wc->version = wc_version + 1; read_unlock_irq(&gpc->lock); kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE); out: srcu_read_unlock(&kvm->srcu, idx); return ret; } void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu) { if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) { struct kvm_xen_evtchn e; e.vcpu_id = vcpu->vcpu_id; e.vcpu_idx = vcpu->vcpu_idx; e.port = vcpu->arch.xen.timer_virq; e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; kvm_xen_set_evtchn(&e, vcpu->kvm); vcpu->arch.xen.timer_expires = 0; atomic_set(&vcpu->arch.xen.timer_pending, 0); } } static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) { struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu, arch.xen.timer); struct kvm_xen_evtchn e; int rc; if (atomic_read(&vcpu->arch.xen.timer_pending)) return HRTIMER_NORESTART; e.vcpu_id = vcpu->vcpu_id; e.vcpu_idx = vcpu->vcpu_idx; e.port = vcpu->arch.xen.timer_virq; e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm); if (rc != -EWOULDBLOCK) { vcpu->arch.xen.timer_expires = 
0; return HRTIMER_NORESTART; } atomic_inc(&vcpu->arch.xen.timer_pending); kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); return HRTIMER_NORESTART; } static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns) { /* * Avoid races with the old timer firing. Checking timer_expires * to avoid calling hrtimer_cancel() will only have false positives * so is fine. */ if (vcpu->arch.xen.timer_expires) hrtimer_cancel(&vcpu->arch.xen.timer); atomic_set(&vcpu->arch.xen.timer_pending, 0); vcpu->arch.xen.timer_expires = guest_abs; if (delta_ns <= 0) { xen_timer_callback(&vcpu->arch.xen.timer); } else { ktime_t ktime_now = ktime_get(); hrtimer_start(&vcpu->arch.xen.timer, ktime_add_ns(ktime_now, delta_ns), HRTIMER_MODE_ABS_HARD); } } static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu) { hrtimer_cancel(&vcpu->arch.xen.timer); vcpu->arch.xen.timer_expires = 0; atomic_set(&vcpu->arch.xen.timer_pending, 0); } static void kvm_xen_init_timer(struct kvm_vcpu *vcpu) { hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); vcpu->arch.xen.timer.function = xen_timer_callback; } static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) { struct kvm_vcpu_xen *vx = &v->arch.xen; struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache; struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache; size_t user_len, user_len1, user_len2; struct vcpu_runstate_info rs; unsigned long flags; size_t times_ofs; uint8_t *update_bit = NULL; uint64_t entry_time; uint64_t *rs_times; int *rs_state; /* * The only difference between 32-bit and 64-bit versions of the * runstate struct is the alignment of uint64_t in 32-bit, which * means that the 64-bit version has an additional 4 bytes of * padding after the first field 'state'. Let's be really really * paranoid about that, and matching it with our internal data * structures that we memcpy into it... */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0); BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0); BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c); #ifdef CONFIG_X86_64 /* * The 64-bit structure has 4 bytes of padding before 'state_entry_time' * so each subsequent field is shifted by 4, and it's 4 bytes longer. */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4); BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) != offsetof(struct compat_vcpu_runstate_info, time) + 4); BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4); #endif /* * The state field is in the same place at the start of both structs, * and is the same size (int) as vx->current_runstate. */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != offsetof(struct compat_vcpu_runstate_info, state)); BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) != sizeof(vx->current_runstate)); BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) != sizeof(vx->current_runstate)); /* * The state_entry_time field is 64 bits in both versions, and the * XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86 * is little-endian means that it's in the last *byte* of the word. * That detail is important later. 
*/ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) != sizeof(uint64_t)); BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != sizeof(uint64_t)); BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80); /* * The time array is four 64-bit quantities in both versions, matching * the vx->runstate_times and immediately following state_entry_time. */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t)); BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) != offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t)); BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof_field(struct compat_vcpu_runstate_info, time)); BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof(vx->runstate_times)); if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) { user_len = sizeof(struct vcpu_runstate_info); times_ofs = offsetof(struct vcpu_runstate_info, state_entry_time); } else { user_len = sizeof(struct compat_vcpu_runstate_info); times_ofs = offsetof(struct compat_vcpu_runstate_info, state_entry_time); } /* * There are basically no alignment constraints. The guest can set it * up so it crosses from one page to the next, and at arbitrary byte * alignment (and the 32-bit ABI doesn't align the 64-bit integers * anyway, even if the overall struct had been 64-bit aligned). */ if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) { user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK); user_len2 = user_len - user_len1; } else { user_len1 = user_len; user_len2 = 0; } BUG_ON(user_len1 + user_len2 != user_len); retry: /* * Attempt to obtain the GPC lock on *both* (if there are two) * gfn_to_pfn caches that cover the region. */ if (atomic) { local_irq_save(flags); if (!read_trylock(&gpc1->lock)) { local_irq_restore(flags); return; } } else { read_lock_irqsave(&gpc1->lock, flags); } while (!kvm_gpc_check(gpc1, user_len1)) { read_unlock_irqrestore(&gpc1->lock, flags); /* When invoked from kvm_sched_out() we cannot sleep */ if (atomic) return; if (kvm_gpc_refresh(gpc1, user_len1)) return; read_lock_irqsave(&gpc1->lock, flags); } if (likely(!user_len2)) { /* * Set up three pointers directly to the runstate_info * struct in the guest (via the GPC). * * • @rs_state → state field * • @rs_times → state_entry_time field. * • @update_bit → last byte of state_entry_time, which * contains the XEN_RUNSTATE_UPDATE bit. */ rs_state = gpc1->khva; rs_times = gpc1->khva + times_ofs; if (v->kvm->arch.xen.runstate_update_flag) update_bit = ((void *)(&rs_times[1])) - 1; } else { /* * The guest's runstate_info is split across two pages and we * need to hold and validate both GPCs simultaneously. We can * declare a lock ordering GPC1 > GPC2 because nothing else * takes them more than one at a time. Set a subclass on the * gpc1 lock to make lockdep shut up about it. */ lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_); if (atomic) { if (!read_trylock(&gpc2->lock)) { read_unlock_irqrestore(&gpc1->lock, flags); return; } } else { read_lock(&gpc2->lock); } if (!kvm_gpc_check(gpc2, user_len2)) { read_unlock(&gpc2->lock); read_unlock_irqrestore(&gpc1->lock, flags); /* When invoked from kvm_sched_out() we cannot sleep */ if (atomic) return; /* * Use kvm_gpc_activate() here because if the runstate * area was configured in 32-bit mode and only extends * to the second page now because the guest changed to * 64-bit mode, the second GPC won't have been set up. 
*/ if (kvm_gpc_activate(gpc2, gpc1->gpa + user_len1, user_len2)) return; /* * We dropped the lock on GPC1 so we have to go all the * way back and revalidate that too. */ goto retry; } /* * In this case, the runstate_info struct will be assembled on * the kernel stack (compat or not as appropriate) and will * be copied to GPC1/GPC2 with a dual memcpy. Set up the three * rs pointers accordingly. */ rs_times = &rs.state_entry_time; /* * The rs_state pointer points to the start of what we'll * copy to the guest, which in the case of a compat guest * is the 32-bit field that the compiler thinks is padding. */ rs_state = ((void *)rs_times) - times_ofs; /* * The update_bit is still directly in the guest memory, * via one GPC or the other. */ if (v->kvm->arch.xen.runstate_update_flag) { if (user_len1 >= times_ofs + sizeof(uint64_t)) update_bit = gpc1->khva + times_ofs + sizeof(uint64_t) - 1; else update_bit = gpc2->khva + times_ofs + sizeof(uint64_t) - 1 - user_len1; } #ifdef CONFIG_X86_64 /* * Don't leak kernel memory through the padding in the 64-bit * version of the struct. */ memset(&rs, 0, offsetof(struct vcpu_runstate_info, state_entry_time)); #endif } /* * First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the * state_entry_time field, directly in the guest. We need to set * that (and write-barrier) before writing to the rest of the * structure, and clear it last. Just as Xen does, we address the * single *byte* in which it resides because it might be in a * different cache line to the rest of the 64-bit word, due to * the (lack of) alignment constraints. */ entry_time = vx->runstate_entry_time; if (update_bit) { entry_time |= XEN_RUNSTATE_UPDATE; *update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56; smp_wmb(); } /* * Now assemble the actual structure, either on our kernel stack * or directly in the guest according to how the rs_state and * rs_times pointers were set up above. */ *rs_state = vx->current_runstate; rs_times[0] = entry_time; memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times)); /* For the split case, we have to then copy it to the guest. */ if (user_len2) { memcpy(gpc1->khva, rs_state, user_len1); memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2); } smp_wmb(); /* Finally, clear the XEN_RUNSTATE_UPDATE bit. */ if (update_bit) { entry_time &= ~XEN_RUNSTATE_UPDATE; *update_bit = entry_time >> 56; smp_wmb(); } if (user_len2) read_unlock(&gpc2->lock); read_unlock_irqrestore(&gpc1->lock, flags); mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT); if (user_len2) mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT); } void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) { struct kvm_vcpu_xen *vx = &v->arch.xen; u64 now = get_kvmclock_ns(v->kvm); u64 delta_ns = now - vx->runstate_entry_time; u64 run_delay = current->sched_info.run_delay; if (unlikely(!vx->runstate_entry_time)) vx->current_runstate = RUNSTATE_offline; /* * Time waiting for the scheduler isn't "stolen" if the * vCPU wasn't running anyway. 
*/ if (vx->current_runstate == RUNSTATE_running) { u64 steal_ns = run_delay - vx->last_steal; delta_ns -= steal_ns; vx->runstate_times[RUNSTATE_runnable] += steal_ns; } vx->last_steal = run_delay; vx->runstate_times[vx->current_runstate] += delta_ns; vx->current_runstate = state; vx->runstate_entry_time = now; if (vx->runstate_cache.active) kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable); } static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) { struct kvm_lapic_irq irq = { }; int r; irq.dest_id = v->vcpu_id; irq.vector = v->arch.xen.upcall_vector; irq.dest_mode = APIC_DEST_PHYSICAL; irq.shorthand = APIC_DEST_NOSHORT; irq.delivery_mode = APIC_DM_FIXED; irq.level = 1; /* The fast version will always work for physical unicast */ WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL)); } /* * On event channel delivery, the vcpu_info may not have been accessible. * In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which * need to be marked into the vcpu_info (and evtchn_upcall_pending set). * Do so now that we can sleep in the context of the vCPU to bring the * page in, and refresh the pfn cache for it. */ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) { unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel); struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache; unsigned long flags; if (!evtchn_pending_sel) return; /* * Yes, this is an open-coded loop. But that's just what put_user() * does anyway. Page it in and retry the instruction. We're just a * little more honest about it. */ read_lock_irqsave(&gpc->lock, flags); while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { read_unlock_irqrestore(&gpc->lock, flags); if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) return; read_lock_irqsave(&gpc->lock, flags); } /* Now gpc->khva is a valid kernel address for the vcpu_info */ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) { struct vcpu_info *vi = gpc->khva; asm volatile(LOCK_PREFIX "orq %0, %1\n" "notq %0\n" LOCK_PREFIX "andq %0, %2\n" : "=r" (evtchn_pending_sel), "+m" (vi->evtchn_pending_sel), "+m" (v->arch.xen.evtchn_pending_sel) : "0" (evtchn_pending_sel)); WRITE_ONCE(vi->evtchn_upcall_pending, 1); } else { u32 evtchn_pending_sel32 = evtchn_pending_sel; struct compat_vcpu_info *vi = gpc->khva; asm volatile(LOCK_PREFIX "orl %0, %1\n" "notl %0\n" LOCK_PREFIX "andl %0, %2\n" : "=r" (evtchn_pending_sel32), "+m" (vi->evtchn_pending_sel), "+m" (v->arch.xen.evtchn_pending_sel) : "0" (evtchn_pending_sel32)); WRITE_ONCE(vi->evtchn_upcall_pending, 1); } read_unlock_irqrestore(&gpc->lock, flags); /* For the per-vCPU lapic vector, deliver it as MSI. */ if (v->arch.xen.upcall_vector) kvm_xen_inject_vcpu_vector(v); mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); } int __kvm_xen_has_interrupt(struct kvm_vcpu *v) { struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache; unsigned long flags; u8 rc = 0; /* * If the global upcall vector (HVMIRQ_callback_vector) is set and * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending. 
*/ /* No need for compat handling here */ BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) != offsetof(struct compat_vcpu_info, evtchn_upcall_pending)); BUILD_BUG_ON(sizeof(rc) != sizeof_field(struct vcpu_info, evtchn_upcall_pending)); BUILD_BUG_ON(sizeof(rc) != sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending)); read_lock_irqsave(&gpc->lock, flags); while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { read_unlock_irqrestore(&gpc->lock, flags); /* * This function gets called from kvm_vcpu_block() after setting the * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately * from a HLT. So we really mustn't sleep. If the page ended up absent * at that point, just return 1 in order to trigger an immediate wake, * and we'll end up getting called again from a context where we *can* * fault in the page and wait for it. */ if (in_atomic() || !task_is_running(current)) return 1; if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) { /* * If this failed, userspace has screwed up the * vcpu_info mapping. No interrupts for you. */ return 0; } read_lock_irqsave(&gpc->lock, flags); } rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending; read_unlock_irqrestore(&gpc->lock, flags); return rc; } int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) { int r = -ENOENT; switch (data->type) { case KVM_XEN_ATTR_TYPE_LONG_MODE: if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) { r = -EINVAL; } else { mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.long_mode = !!data->u.long_mode; mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; } break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: mutex_lock(&kvm->arch.xen.xen_lock); r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); mutex_unlock(&kvm->arch.xen.xen_lock); break; case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: if (data->u.vector && data->u.vector < 0x10) r = -EINVAL; else { mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.upcall_vector = data->u.vector; mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; } break; case KVM_XEN_ATTR_TYPE_EVTCHN: r = kvm_xen_setattr_evtchn(kvm, data); break; case KVM_XEN_ATTR_TYPE_XEN_VERSION: mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.xen_version = data->u.xen_version; mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; break; case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG: if (!sched_info_on()) { r = -EOPNOTSUPP; break; } mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag; mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; break; default: break; } return r; } int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) { int r = -ENOENT; mutex_lock(&kvm->arch.xen.xen_lock); switch (data->type) { case KVM_XEN_ATTR_TYPE_LONG_MODE: data->u.long_mode = kvm->arch.xen.long_mode; r = 0; break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: if (kvm->arch.xen.shinfo_cache.active) data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); else data->u.shared_info.gfn = KVM_XEN_INVALID_GFN; r = 0; break; case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: data->u.vector = kvm->arch.xen.upcall_vector; r = 0; break; case KVM_XEN_ATTR_TYPE_XEN_VERSION: data->u.xen_version = kvm->arch.xen.xen_version; r = 0; break; case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG: if (!sched_info_on()) { r = -EOPNOTSUPP; break; } data->u.runstate_update_flag = kvm->arch.xen.runstate_update_flag; r = 0; break; default: break; } mutex_unlock(&kvm->arch.xen.xen_lock); return r; } int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) { int idx, r = 
-ENOENT; mutex_lock(&vcpu->kvm->arch.xen.xen_lock); idx = srcu_read_lock(&vcpu->kvm->srcu); switch (data->type) { case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: /* No compat necessary here. */ BUILD_BUG_ON(sizeof(struct vcpu_info) != sizeof(struct compat_vcpu_info)); BUILD_BUG_ON(offsetof(struct vcpu_info, time) != offsetof(struct compat_vcpu_info, time)); if (data->u.gpa == KVM_XEN_INVALID_GPA) { kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); r = 0; break; } r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache, data->u.gpa, sizeof(struct vcpu_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); break; case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: if (data->u.gpa == KVM_XEN_INVALID_GPA) { kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache); r = 0; break; } r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_time_info_cache, data->u.gpa, sizeof(struct pvclock_vcpu_time_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); break; case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: { size_t sz, sz1, sz2; if (!sched_info_on()) { r = -EOPNOTSUPP; break; } if (data->u.gpa == KVM_XEN_INVALID_GPA) { r = 0; deactivate_out: kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache); kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache); break; } /* * If the guest switches to 64-bit mode after setting the runstate * address, that's actually OK. kvm_xen_update_runstate_guest() * will cope. */ if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode) sz = sizeof(struct vcpu_runstate_info); else sz = sizeof(struct compat_vcpu_runstate_info); /* How much fits in the (first) page? */ sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK); r = kvm_gpc_activate(&vcpu->arch.xen.runstate_cache, data->u.gpa, sz1); if (r) goto deactivate_out; /* Either map the second page, or deactivate the second GPC */ if (sz1 >= sz) { kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache); } else { sz2 = sz - sz1; BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK); r = kvm_gpc_activate(&vcpu->arch.xen.runstate2_cache, data->u.gpa + sz1, sz2); if (r) goto deactivate_out; } kvm_xen_update_runstate_guest(vcpu, false); break; } case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT: if (!sched_info_on()) { r = -EOPNOTSUPP; break; } if (data->u.runstate.state > RUNSTATE_offline) { r = -EINVAL; break; } kvm_xen_update_runstate(vcpu, data->u.runstate.state); r = 0; break; case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA: if (!sched_info_on()) { r = -EOPNOTSUPP; break; } if (data->u.runstate.state > RUNSTATE_offline) { r = -EINVAL; break; } if (data->u.runstate.state_entry_time != (data->u.runstate.time_running + data->u.runstate.time_runnable + data->u.runstate.time_blocked + data->u.runstate.time_offline)) { r = -EINVAL; break; } if (get_kvmclock_ns(vcpu->kvm) < data->u.runstate.state_entry_time) { r = -EINVAL; break; } vcpu->arch.xen.current_runstate = data->u.runstate.state; vcpu->arch.xen.runstate_entry_time = data->u.runstate.state_entry_time; vcpu->arch.xen.runstate_times[RUNSTATE_running] = data->u.runstate.time_running; vcpu->arch.xen.runstate_times[RUNSTATE_runnable] = data->u.runstate.time_runnable; vcpu->arch.xen.runstate_times[RUNSTATE_blocked] = data->u.runstate.time_blocked; vcpu->arch.xen.runstate_times[RUNSTATE_offline] = data->u.runstate.time_offline; vcpu->arch.xen.last_steal = current->sched_info.run_delay; r = 0; break; case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST: if (!sched_info_on()) { r = -EOPNOTSUPP; break; } if (data->u.runstate.state > RUNSTATE_offline && data->u.runstate.state != (u64)-1) { r = -EINVAL; break; } /* The adjustment must add up */ if 
(data->u.runstate.state_entry_time != (data->u.runstate.time_running + data->u.runstate.time_runnable + data->u.runstate.time_blocked + data->u.runstate.time_offline)) { r = -EINVAL; break; } if (get_kvmclock_ns(vcpu->kvm) < (vcpu->arch.xen.runstate_entry_time + data->u.runstate.state_entry_time)) { r = -EINVAL; break; } vcpu->arch.xen.runstate_entry_time += data->u.runstate.state_entry_time; vcpu->arch.xen.runstate_times[RUNSTATE_running] += data->u.runstate.time_running; vcpu->arch.xen.runstate_times[RUNSTATE_runnable] += data->u.runstate.time_runnable; vcpu->arch.xen.runstate_times[RUNSTATE_blocked] += data->u.runstate.time_blocked; vcpu->arch.xen.runstate_times[RUNSTATE_offline] += data->u.runstate.time_offline; if (data->u.runstate.state <= RUNSTATE_offline) kvm_xen_update_runstate(vcpu, data->u.runstate.state); else if (vcpu->arch.xen.runstate_cache.active) kvm_xen_update_runstate_guest(vcpu, false); r = 0; break; case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID: if (data->u.vcpu_id >= KVM_MAX_VCPUS) r = -EINVAL; else { vcpu->arch.xen.vcpu_id = data->u.vcpu_id; r = 0; } break; case KVM_XEN_VCPU_ATTR_TYPE_TIMER: if (data->u.timer.port && data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) { r = -EINVAL; break; } if (!vcpu->arch.xen.timer.function) kvm_xen_init_timer(vcpu); /* Stop the timer (if it's running) before changing the vector */ kvm_xen_stop_timer(vcpu); vcpu->arch.xen.timer_virq = data->u.timer.port; /* Start the timer if the new value has a valid vector+expiry. */ if (data->u.timer.port && data->u.timer.expires_ns) kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, data->u.timer.expires_ns - get_kvmclock_ns(vcpu->kvm)); r = 0; break; case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR: if (data->u.vector && data->u.vector < 0x10) r = -EINVAL; else { vcpu->arch.xen.upcall_vector = data->u.vector; r = 0; } break; default: break; } srcu_read_unlock(&vcpu->kvm->srcu, idx); mutex_unlock(&vcpu->kvm->arch.xen.xen_lock); return r; } int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) { int r = -ENOENT; mutex_lock(&vcpu->kvm->arch.xen.xen_lock); switch (data->type) { case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: if (vcpu->arch.xen.vcpu_info_cache.active) data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa; else data->u.gpa = KVM_XEN_INVALID_GPA; r = 0; break; case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: if (vcpu->arch.xen.vcpu_time_info_cache.active) data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa; else data->u.gpa = KVM_XEN_INVALID_GPA; r = 0; break; case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: if (!sched_info_on()) { r = -EOPNOTSUPP; break; } if (vcpu->arch.xen.runstate_cache.active) { data->u.gpa = vcpu->arch.xen.runstate_cache.gpa; r = 0; } break; case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT: if (!sched_info_on()) { r = -EOPNOTSUPP; break; } data->u.runstate.state = vcpu->arch.xen.current_runstate; r = 0; break; case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA: if (!sched_info_on()) { r = -EOPNOTSUPP; break; } data->u.runstate.state = vcpu->arch.xen.current_runstate; data->u.runstate.state_entry_time = vcpu->arch.xen.runstate_entry_time; data->u.runstate.time_running = vcpu->arch.xen.runstate_times[RUNSTATE_running]; data->u.runstate.time_runnable = vcpu->arch.xen.runstate_times[RUNSTATE_runnable]; data->u.runstate.time_blocked = vcpu->arch.xen.runstate_times[RUNSTATE_blocked]; data->u.runstate.time_offline = vcpu->arch.xen.runstate_times[RUNSTATE_offline]; r = 0; break; case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST: r = -EINVAL; break; case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID: 
data->u.vcpu_id = vcpu->arch.xen.vcpu_id; r = 0; break; case KVM_XEN_VCPU_ATTR_TYPE_TIMER: /* * Ensure a consistent snapshot of state is captured, with a * timer either being pending, or the event channel delivered * to the corresponding bit in the shared_info. Not still * lurking in the timer_pending flag for deferred delivery. * Purely as an optimisation, if the timer_expires field is * zero, that means the timer isn't active (or even in the * timer_pending flag) and there is no need to cancel it. */ if (vcpu->arch.xen.timer_expires) { hrtimer_cancel(&vcpu->arch.xen.timer); kvm_xen_inject_timer_irqs(vcpu); } data->u.timer.port = vcpu->arch.xen.timer_virq; data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; data->u.timer.expires_ns = vcpu->arch.xen.timer_expires; /* * The hrtimer may trigger and raise the IRQ immediately, * while the returned state causes it to be set up and * raised again on the destination system after migration. * That's fine, as the guest won't even have had a chance * to run and handle the interrupt. Asserting an already * pending event channel is idempotent. */ if (vcpu->arch.xen.timer_expires) hrtimer_start_expires(&vcpu->arch.xen.timer, HRTIMER_MODE_ABS_HARD); r = 0; break; case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR: data->u.vector = vcpu->arch.xen.upcall_vector; r = 0; break; default: break; } mutex_unlock(&vcpu->kvm->arch.xen.xen_lock); return r; } int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; u32 page_num = data & ~PAGE_MASK; u64 page_addr = data & PAGE_MASK; bool lm = is_long_mode(vcpu); /* Latch long_mode for shared_info pages etc. */ vcpu->kvm->arch.xen.long_mode = lm; /* * If Xen hypercall intercept is enabled, fill the hypercall * page with VMCALL/VMMCALL instructions since that's what * we catch. Else the VMM has provided the hypercall pages * with instructions of its own choosing, so use those. */ if (kvm_xen_hypercall_enabled(kvm)) { u8 instructions[32]; int i; if (page_num) return 1; /* mov imm32, %eax */ instructions[0] = 0xb8; /* vmcall / vmmcall */ static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5); /* ret */ instructions[8] = 0xc3; /* int3 to pad */ memset(instructions + 9, 0xcc, sizeof(instructions) - 9); for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) { *(u32 *)&instructions[1] = i; if (kvm_vcpu_write_guest(vcpu, page_addr + (i * sizeof(instructions)), instructions, sizeof(instructions))) return 1; } } else { /* * Note, truncation is a non-issue as 'lm' is guaranteed to be * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes. */ hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64 : kvm->arch.xen_hvm_config.blob_addr_32; u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 : kvm->arch.xen_hvm_config.blob_size_32; u8 *page; int ret; if (page_num >= blob_size) return 1; blob_addr += page_num * PAGE_SIZE; page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE); if (IS_ERR(page)) return PTR_ERR(page); ret = kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE); kfree(page); if (ret) return 1; } return 0; } int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) { /* Only some feature flags need to be *enabled* by userspace */ u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_EVTCHN_SEND | KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE; u32 old_flags; if (xhc->flags & ~permitted_flags) return -EINVAL; /* * With hypercall interception the kernel generates its own * hypercall page so it must not be provided. 
*/ if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) && (xhc->blob_addr_32 || xhc->blob_addr_64 || xhc->blob_size_32 || xhc->blob_size_64)) return -EINVAL; mutex_lock(&kvm->arch.xen.xen_lock); if (xhc->msr && !kvm->arch.xen_hvm_config.msr) static_branch_inc(&kvm_xen_enabled.key); else if (!xhc->msr && kvm->arch.xen_hvm_config.msr) static_branch_slow_dec_deferred(&kvm_xen_enabled); old_flags = kvm->arch.xen_hvm_config.flags; memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc)); mutex_unlock(&kvm->arch.xen.xen_lock); if ((old_flags ^ xhc->flags) & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE) kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); return 0; } static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) { kvm_rax_write(vcpu, result); return kvm_skip_emulated_instruction(vcpu); } static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip))) return 1; return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result); } static inline int max_evtchn_port(struct kvm *kvm) { if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) return EVTCHN_2L_NR_CHANNELS; else return COMPAT_EVTCHN_2L_NR_CHANNELS; } static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, evtchn_port_t *ports) { struct kvm *kvm = vcpu->kvm; struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; unsigned long *pending_bits; unsigned long flags; bool ret = true; int idx, i; idx = srcu_read_lock(&kvm->srcu); read_lock_irqsave(&gpc->lock, flags); if (!kvm_gpc_check(gpc, PAGE_SIZE)) goto out_rcu; ret = false; if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { struct shared_info *shinfo = gpc->khva; pending_bits = (unsigned long *)&shinfo->evtchn_pending; } else { struct compat_shared_info *shinfo = gpc->khva; pending_bits = (unsigned long *)&shinfo->evtchn_pending; } for (i = 0; i < nr_ports; i++) { if (test_bit(ports[i], pending_bits)) { ret = true; break; } } out_rcu: read_unlock_irqrestore(&gpc->lock, flags); srcu_read_unlock(&kvm->srcu, idx); return ret; } static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, u64 param, u64 *r) { struct sched_poll sched_poll; evtchn_port_t port, *ports; struct x86_exception e; int i; if (!lapic_in_kernel(vcpu) || !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)) return false; if (IS_ENABLED(CONFIG_64BIT) && !longmode) { struct compat_sched_poll sp32; /* Sanity check that the compat struct definition is correct */ BUILD_BUG_ON(sizeof(sp32) != 16); if (kvm_read_guest_virt(vcpu, param, &sp32, sizeof(sp32), &e)) { *r = -EFAULT; return true; } /* * This is a 32-bit pointer to an array of evtchn_port_t which * are uint32_t, so once it's converted no further compat * handling is needed. 
*/ sched_poll.ports = (void *)(unsigned long)(sp32.ports); sched_poll.nr_ports = sp32.nr_ports; sched_poll.timeout = sp32.timeout; } else { if (kvm_read_guest_virt(vcpu, param, &sched_poll, sizeof(sched_poll), &e)) { *r = -EFAULT; return true; } } if (unlikely(sched_poll.nr_ports > 1)) { /* Xen (unofficially) limits number of pollers to 128 */ if (sched_poll.nr_ports > 128) { *r = -EINVAL; return true; } ports = kmalloc_array(sched_poll.nr_ports, sizeof(*ports), GFP_KERNEL); if (!ports) { *r = -ENOMEM; return true; } } else ports = &port; if (kvm_read_guest_virt(vcpu, (gva_t)sched_poll.ports, ports, sched_poll.nr_ports * sizeof(*ports), &e)) { *r = -EFAULT; return true; } for (i = 0; i < sched_poll.nr_ports; i++) { if (ports[i] >= max_evtchn_port(vcpu->kvm)) { *r = -EINVAL; goto out; } } if (sched_poll.nr_ports == 1) vcpu->arch.xen.poll_evtchn = port; else vcpu->arch.xen.poll_evtchn = -1; set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask); if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; if (sched_poll.timeout) mod_timer(&vcpu->arch.xen.poll_timer, jiffies + nsecs_to_jiffies(sched_poll.timeout)); kvm_vcpu_halt(vcpu); if (sched_poll.timeout) del_timer(&vcpu->arch.xen.poll_timer); vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } vcpu->arch.xen.poll_evtchn = 0; *r = 0; out: /* Really, this is only needed in case of timeout */ clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask); if (unlikely(sched_poll.nr_ports > 1)) kfree(ports); return true; } static void cancel_evtchn_poll(struct timer_list *t) { struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer); kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); } static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, u64 param, u64 *r) { switch (cmd) { case SCHEDOP_poll: if (kvm_xen_schedop_poll(vcpu, longmode, param, r)) return true; fallthrough; case SCHEDOP_yield: kvm_vcpu_on_spin(vcpu, true); *r = 0; return true; default: break; } return false; } struct compat_vcpu_set_singleshot_timer { uint64_t timeout_abs_ns; uint32_t flags; } __attribute__((packed)); static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, int vcpu_id, u64 param, u64 *r) { struct vcpu_set_singleshot_timer oneshot; struct x86_exception e; s64 delta; if (!kvm_xen_timer_enabled(vcpu)) return false; switch (cmd) { case VCPUOP_set_singleshot_timer: if (vcpu->arch.xen.vcpu_id != vcpu_id) { *r = -EINVAL; return true; } /* * The only difference for 32-bit compat is the 4 bytes of * padding after the interesting part of the structure. So * for a faithful emulation of Xen we have to *try* to copy * the padding and return -EFAULT if we can't. Otherwise we * might as well just have copied the 12-byte 32-bit struct. */ BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) != offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns)); BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) != sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns)); BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) != offsetof(struct vcpu_set_singleshot_timer, flags)); BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) != sizeof_field(struct vcpu_set_singleshot_timer, flags)); if (kvm_read_guest_virt(vcpu, param, &oneshot, longmode ? 
sizeof(oneshot) : sizeof(struct compat_vcpu_set_singleshot_timer), &e)) { *r = -EFAULT; return true; } /* A delta <= 0 results in an immediate callback, which is what we want */ delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm); kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta); *r = 0; return true; case VCPUOP_stop_singleshot_timer: if (vcpu->arch.xen.vcpu_id != vcpu_id) { *r = -EINVAL; return true; } kvm_xen_stop_timer(vcpu); *r = 0; return true; } return false; } static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout, u64 *r) { if (!kvm_xen_timer_enabled(vcpu)) return false; if (timeout) { uint64_t guest_now = get_kvmclock_ns(vcpu->kvm); int64_t delta = timeout - guest_now; /* Xen has a 'Linux workaround' in do_set_timer_op() which * checks for negative absolute timeout values (caused by * integer overflow), and for values about 13 days in the * future (2^50ns) which would be caused by jiffies * overflow. For those cases, it sets the timeout 100ms in * the future (not *too* soon, since if a guest really did * set a long timeout on purpose we don't want to keep * churning CPU time by waking it up). */ if (unlikely((int64_t)timeout < 0 || (delta > 0 && (uint32_t) (delta >> 50) != 0))) { delta = 100 * NSEC_PER_MSEC; timeout = guest_now + delta; } kvm_xen_start_timer(vcpu, timeout, delta); } else { kvm_xen_stop_timer(vcpu); } *r = 0; return true; } int kvm_xen_hypercall(struct kvm_vcpu *vcpu) { bool longmode; u64 input, params[6], r = -ENOSYS; bool handled = false; u8 cpl; input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX); /* Hyper-V hypercalls get bit 31 set in EAX */ if ((input & 0x80000000) && kvm_hv_hypercall_enabled(vcpu)) return kvm_hv_hypercall(vcpu); longmode = is_64_bit_hypercall(vcpu); if (!longmode) { params[0] = (u32)kvm_rbx_read(vcpu); params[1] = (u32)kvm_rcx_read(vcpu); params[2] = (u32)kvm_rdx_read(vcpu); params[3] = (u32)kvm_rsi_read(vcpu); params[4] = (u32)kvm_rdi_read(vcpu); params[5] = (u32)kvm_rbp_read(vcpu); } #ifdef CONFIG_X86_64 else { params[0] = (u64)kvm_rdi_read(vcpu); params[1] = (u64)kvm_rsi_read(vcpu); params[2] = (u64)kvm_rdx_read(vcpu); params[3] = (u64)kvm_r10_read(vcpu); params[4] = (u64)kvm_r8_read(vcpu); params[5] = (u64)kvm_r9_read(vcpu); } #endif cpl = static_call(kvm_x86_get_cpl)(vcpu); trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2], params[3], params[4], params[5]); /* * Only allow hypercall acceleration for CPL0. The rare hypercalls that * are permitted in guest userspace can be handled by the VMM. */ if (unlikely(cpl > 0)) goto handle_in_userspace; switch (input) { case __HYPERVISOR_xen_version: if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) { r = vcpu->kvm->arch.xen.xen_version; handled = true; } break; case __HYPERVISOR_event_channel_op: if (params[0] == EVTCHNOP_send) handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r); break; case __HYPERVISOR_sched_op: handled = kvm_xen_hcall_sched_op(vcpu, longmode, params[0], params[1], &r); break; case __HYPERVISOR_vcpu_op: handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, params[0], params[1], params[2], &r); break; case __HYPERVISOR_set_timer_op: { u64 timeout = params[0]; /* In 32-bit mode, the 64-bit timeout is in two 32-bit params. 
*/ if (!longmode) timeout |= params[1] << 32; handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r); break; } default: break; } if (handled) return kvm_xen_hypercall_set_result(vcpu, r); handle_in_userspace: vcpu->run->exit_reason = KVM_EXIT_XEN; vcpu->run->xen.type = KVM_EXIT_XEN_HCALL; vcpu->run->xen.u.hcall.longmode = longmode; vcpu->run->xen.u.hcall.cpl = cpl; vcpu->run->xen.u.hcall.input = input; vcpu->run->xen.u.hcall.params[0] = params[0]; vcpu->run->xen.u.hcall.params[1] = params[1]; vcpu->run->xen.u.hcall.params[2] = params[2]; vcpu->run->xen.u.hcall.params[3] = params[3]; vcpu->run->xen.u.hcall.params[4] = params[4]; vcpu->run->xen.u.hcall.params[5] = params[5]; vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = kvm_xen_hypercall_complete_userspace; return 0; } static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port) { int poll_evtchn = vcpu->arch.xen.poll_evtchn; if ((poll_evtchn == port || poll_evtchn == -1) && test_and_clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask)) { kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); } } /* * The return value from this function is propagated to kvm_set_irq() API, * so it returns: * < 0 Interrupt was ignored (masked or not delivered for other reasons) * = 0 Interrupt was coalesced (previous irq is still pending) * > 0 Number of CPUs interrupt was delivered to * * It is also called directly from kvm_arch_set_irq_inatomic(), where the * only check on its return value is a comparison with -EWOULDBLOCK'. */ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) { struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; struct kvm_vcpu *vcpu; unsigned long *pending_bits, *mask_bits; unsigned long flags; int port_word_bit; bool kick_vcpu = false; int vcpu_idx, idx, rc; vcpu_idx = READ_ONCE(xe->vcpu_idx); if (vcpu_idx >= 0) vcpu = kvm_get_vcpu(kvm, vcpu_idx); else { vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id); if (!vcpu) return -EINVAL; WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx); } if (!vcpu->arch.xen.vcpu_info_cache.active) return -EINVAL; if (xe->port >= max_evtchn_port(kvm)) return -EINVAL; rc = -EWOULDBLOCK; idx = srcu_read_lock(&kvm->srcu); read_lock_irqsave(&gpc->lock, flags); if (!kvm_gpc_check(gpc, PAGE_SIZE)) goto out_rcu; if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { struct shared_info *shinfo = gpc->khva; pending_bits = (unsigned long *)&shinfo->evtchn_pending; mask_bits = (unsigned long *)&shinfo->evtchn_mask; port_word_bit = xe->port / 64; } else { struct compat_shared_info *shinfo = gpc->khva; pending_bits = (unsigned long *)&shinfo->evtchn_pending; mask_bits = (unsigned long *)&shinfo->evtchn_mask; port_word_bit = xe->port / 32; } /* * If this port wasn't already set, and if it isn't masked, then * we try to set the corresponding bit in the in-kernel shadow of * evtchn_pending_sel for the target vCPU. And if *that* wasn't * already set, then we kick the vCPU in question to write to the * *real* evtchn_pending_sel in its own guest vcpu_info struct. */ if (test_and_set_bit(xe->port, pending_bits)) { rc = 0; /* It was already raised */ } else if (test_bit(xe->port, mask_bits)) { rc = -ENOTCONN; /* Masked */ kvm_xen_check_poller(vcpu, xe->port); } else { rc = 1; /* Delivered to the bitmap in shared_info. 
*/ /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */ read_unlock_irqrestore(&gpc->lock, flags); gpc = &vcpu->arch.xen.vcpu_info_cache; read_lock_irqsave(&gpc->lock, flags); if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) { /* * Could not access the vcpu_info. Set the bit in-kernel * and prod the vCPU to deliver it for itself. */ if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) kick_vcpu = true; goto out_rcu; } if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { struct vcpu_info *vcpu_info = gpc->khva; if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) { WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); kick_vcpu = true; } } else { struct compat_vcpu_info *vcpu_info = gpc->khva; if (!test_and_set_bit(port_word_bit, (unsigned long *)&vcpu_info->evtchn_pending_sel)) { WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); kick_vcpu = true; } } /* For the per-vCPU lapic vector, deliver it as MSI. */ if (kick_vcpu && vcpu->arch.xen.upcall_vector) { kvm_xen_inject_vcpu_vector(vcpu); kick_vcpu = false; } } out_rcu: read_unlock_irqrestore(&gpc->lock, flags); srcu_read_unlock(&kvm->srcu, idx); if (kick_vcpu) { kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); } return rc; } static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) { bool mm_borrowed = false; int rc; rc = kvm_xen_set_evtchn_fast(xe, kvm); if (rc != -EWOULDBLOCK) return rc; if (current->mm != kvm->mm) { /* * If not on a thread which already belongs to this KVM, * we'd better be in the irqfd workqueue. */ if (WARN_ON_ONCE(current->mm)) return -EINVAL; kthread_use_mm(kvm->mm); mm_borrowed = true; } mutex_lock(&kvm->arch.xen.xen_lock); /* * It is theoretically possible for the page to be unmapped * and the MMU notifier to invalidate the shared_info before * we even get to use it. In that case, this looks like an * infinite loop. It was tempting to do it via the userspace * HVA instead... but that just *hides* the fact that it's * an infinite loop, because if a fault occurs and it waits * for the page to come back, it can *still* immediately * fault and have to wait again, repeatedly. * * Conversely, the page could also have been reinstated by * another thread before we even obtain the mutex above, so * check again *first* before remapping it. */ do { struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; int idx; rc = kvm_xen_set_evtchn_fast(xe, kvm); if (rc != -EWOULDBLOCK) break; idx = srcu_read_lock(&kvm->srcu); rc = kvm_gpc_refresh(gpc, PAGE_SIZE); srcu_read_unlock(&kvm->srcu, idx); } while(!rc); mutex_unlock(&kvm->arch.xen.xen_lock); if (mm_borrowed) kthread_unuse_mm(kvm->mm); return rc; } /* This is the version called from kvm_set_irq() as the .set function */ static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) { if (!level) return -EINVAL; return kvm_xen_set_evtchn(&e->xen_evtchn, kvm); } /* * Set up an event channel interrupt from the KVM IRQ routing table. * Used for e.g. PIRQ from passed through physical devices. 
*/ int kvm_xen_setup_evtchn(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { struct kvm_vcpu *vcpu; if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm)) return -EINVAL; /* We only support 2 level event channels for now */ if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) return -EINVAL; /* * Xen gives us interesting mappings from vCPU index to APIC ID, * which means kvm_get_vcpu_by_id() has to iterate over all vCPUs * to find it. Do that once at setup time, instead of every time. * But beware that on live update / live migration, the routing * table might be reinstated before the vCPU threads have finished * recreating their vCPUs. */ vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu); if (vcpu) e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx; else e->xen_evtchn.vcpu_idx = -1; e->xen_evtchn.port = ue->u.xen_evtchn.port; e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu; e->xen_evtchn.priority = ue->u.xen_evtchn.priority; e->set = evtchn_set_fn; return 0; } /* * Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl. */ int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe) { struct kvm_xen_evtchn e; int ret; if (!uxe->port || uxe->port >= max_evtchn_port(kvm)) return -EINVAL; /* We only support 2 level event channels for now */ if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) return -EINVAL; e.port = uxe->port; e.vcpu_id = uxe->vcpu; e.vcpu_idx = -1; e.priority = uxe->priority; ret = kvm_xen_set_evtchn(&e, kvm); /* * None of that 'return 1 if it actually got delivered' nonsense. * We don't care if it was masked (-ENOTCONN) either. */ if (ret > 0 || ret == -ENOTCONN) ret = 0; return ret; } /* * Support for *outbound* event channel events via the EVTCHNOP_send hypercall. */ struct evtchnfd { u32 send_port; u32 type; union { struct kvm_xen_evtchn port; struct { u32 port; /* zero */ struct eventfd_ctx *ctx; } eventfd; } deliver; }; /* * Update target vCPU or priority for a registered sending channel. */ static int kvm_xen_eventfd_update(struct kvm *kvm, struct kvm_xen_hvm_attr *data) { u32 port = data->u.evtchn.send_port; struct evtchnfd *evtchnfd; int ret; /* Protect writes to evtchnfd as well as the idr lookup. */ mutex_lock(&kvm->arch.xen.xen_lock); evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port); ret = -ENOENT; if (!evtchnfd) goto out_unlock; /* For an UPDATE, nothing may change except the priority/vcpu */ ret = -EINVAL; if (evtchnfd->type != data->u.evtchn.type) goto out_unlock; /* * Port cannot change, and if it's zero that was an eventfd * which can't be changed either. */ if (!evtchnfd->deliver.port.port || evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port) goto out_unlock; /* We only support 2 level event channels for now */ if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) goto out_unlock; evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) { evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu; evtchnfd->deliver.port.vcpu_idx = -1; } ret = 0; out_unlock: mutex_unlock(&kvm->arch.xen.xen_lock); return ret; } /* * Configure the target (eventfd or local port delivery) for sending on * a given event channel. 
*/ static int kvm_xen_eventfd_assign(struct kvm *kvm, struct kvm_xen_hvm_attr *data) { u32 port = data->u.evtchn.send_port; struct eventfd_ctx *eventfd = NULL; struct evtchnfd *evtchnfd; int ret = -EINVAL; evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL); if (!evtchnfd) return -ENOMEM; switch(data->u.evtchn.type) { case EVTCHNSTAT_ipi: /* IPI must map back to the same port# */ if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port) goto out_noeventfd; /* -EINVAL */ break; case EVTCHNSTAT_interdomain: if (data->u.evtchn.deliver.port.port) { if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm)) goto out_noeventfd; /* -EINVAL */ } else { eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd); if (IS_ERR(eventfd)) { ret = PTR_ERR(eventfd); goto out_noeventfd; } } break; case EVTCHNSTAT_virq: case EVTCHNSTAT_closed: case EVTCHNSTAT_unbound: case EVTCHNSTAT_pirq: default: /* Unknown event channel type */ goto out; /* -EINVAL */ } evtchnfd->send_port = data->u.evtchn.send_port; evtchnfd->type = data->u.evtchn.type; if (eventfd) { evtchnfd->deliver.eventfd.ctx = eventfd; } else { /* We only support 2 level event channels for now */ if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) goto out; /* -EINVAL; */ evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port; evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu; evtchnfd->deliver.port.vcpu_idx = -1; evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; } mutex_lock(&kvm->arch.xen.xen_lock); ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1, GFP_KERNEL); mutex_unlock(&kvm->arch.xen.xen_lock); if (ret >= 0) return 0; if (ret == -ENOSPC) ret = -EEXIST; out: if (eventfd) eventfd_ctx_put(eventfd); out_noeventfd: kfree(evtchnfd); return ret; } static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port) { struct evtchnfd *evtchnfd; mutex_lock(&kvm->arch.xen.xen_lock); evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port); mutex_unlock(&kvm->arch.xen.xen_lock); if (!evtchnfd) return -ENOENT; synchronize_srcu(&kvm->srcu); if (!evtchnfd->deliver.port.port) eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); kfree(evtchnfd); return 0; } static int kvm_xen_eventfd_reset(struct kvm *kvm) { struct evtchnfd *evtchnfd, **all_evtchnfds; int i; int n = 0; mutex_lock(&kvm->arch.xen.xen_lock); /* * Because synchronize_srcu() cannot be called inside the * critical section, first collect all the evtchnfd objects * in an array as they are removed from evtchn_ports. 
*/ idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) n++; all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL); if (!all_evtchnfds) { mutex_unlock(&kvm->arch.xen.xen_lock); return -ENOMEM; } n = 0; idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { all_evtchnfds[n++] = evtchnfd; idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port); } mutex_unlock(&kvm->arch.xen.xen_lock); synchronize_srcu(&kvm->srcu); while (n--) { evtchnfd = all_evtchnfds[n]; if (!evtchnfd->deliver.port.port) eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); kfree(evtchnfd); } kfree(all_evtchnfds); return 0; } static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data) { u32 port = data->u.evtchn.send_port; if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET) return kvm_xen_eventfd_reset(kvm); if (!port || port >= max_evtchn_port(kvm)) return -EINVAL; if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN) return kvm_xen_eventfd_deassign(kvm, port); if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE) return kvm_xen_eventfd_update(kvm, data); if (data->u.evtchn.flags) return -EINVAL; return kvm_xen_eventfd_assign(kvm, data); } static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r) { struct evtchnfd *evtchnfd; struct evtchn_send send; struct x86_exception e; /* Sanity check: this structure is the same for 32-bit and 64-bit */ BUILD_BUG_ON(sizeof(send) != 4); if (kvm_read_guest_virt(vcpu, param, &send, sizeof(send), &e)) { *r = -EFAULT; return true; } /* * evtchnfd is protected by kvm->srcu; the idr lookup instead * is protected by RCU. */ rcu_read_lock(); evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port); rcu_read_unlock(); if (!evtchnfd) return false; if (evtchnfd->deliver.port.port) { int ret = kvm_xen_set_evtchn(&evtchnfd->deliver.port, vcpu->kvm); if (ret < 0 && ret != -ENOTCONN) return false; } else { eventfd_signal(evtchnfd->deliver.eventfd.ctx); } *r = 0; return true; } void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) { vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx; vcpu->arch.xen.poll_evtchn = 0; timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm, NULL, KVM_HOST_USES_PFN); kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm, NULL, KVM_HOST_USES_PFN); kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm, NULL, KVM_HOST_USES_PFN); kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm, NULL, KVM_HOST_USES_PFN); } void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) { if (kvm_xen_timer_enabled(vcpu)) kvm_xen_stop_timer(vcpu); kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache); kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache); kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache); del_timer_sync(&vcpu->arch.xen.poll_timer); } void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *entry; u32 function; if (!vcpu->arch.xen.cpuid.base) return; function = vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3); if (function > vcpu->arch.xen.cpuid.limit) return; entry = kvm_find_cpuid_entry_index(vcpu, function, 1); if (entry) { entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul; entry->edx = vcpu->arch.hv_clock.tsc_shift; } entry = kvm_find_cpuid_entry_index(vcpu, function, 2); if (entry) entry->eax = vcpu->arch.hw_tsc_khz; } void kvm_xen_init_vm(struct kvm *kvm) { mutex_init(&kvm->arch.xen.xen_lock); idr_init(&kvm->arch.xen.evtchn_ports); kvm_gpc_init(&kvm->arch.xen.shinfo_cache, 
kvm, NULL, KVM_HOST_USES_PFN); } void kvm_xen_destroy_vm(struct kvm *kvm) { struct evtchnfd *evtchnfd; int i; kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { if (!evtchnfd->deliver.port.port) eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); kfree(evtchnfd); } idr_destroy(&kvm->arch.xen.evtchn_ports); if (kvm->arch.xen_hvm_config.msr) static_branch_slow_dec_deferred(&kvm_xen_enabled); }
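/*
 * Illustrative sketch (not part of the kernel source above): how userspace
 * might inject an event channel through the KVM_XEN_HVM_EVTCHN_SEND ioctl
 * that kvm_xen_hvm_evtchn_send() services. It assumes a 'vm_fd' that has
 * already been configured via KVM_XEN_HVM_CONFIG with the
 * KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag and a shared_info page set up;
 * 'vm_fd' and the port/vcpu values are hypothetical.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

static int send_xen_evtchn(int vm_fd, unsigned int port, unsigned int vcpu)
{
	struct kvm_irq_routing_xen_evtchn evt = {
		.port = port,	/* must be non-zero and below max_evtchn_port() */
		.vcpu = vcpu,	/* Xen vCPU id, resolved via kvm_get_vcpu_by_id() */
		.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, /* only 2-level is accepted */
	};

	if (ioctl(vm_fd, KVM_XEN_HVM_EVTCHN_SEND, &evt) < 0) {
		perror("KVM_XEN_HVM_EVTCHN_SEND");
		return -1;
	}
	return 0;
}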
// SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * * TODO: try to use extents tree (instead of array) */ #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/log2.h> #include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" /* runs_tree is contiguous memory. Try to avoid big sizes. */ #define NTFS3_RUN_MAX_BYTES 0x10000 struct ntfs_run { CLST vcn; /* Virtual cluster number. */ CLST len; /* Length in clusters. */ CLST lcn; /* Logical cluster number. */ }; /* * run_lookup - Look up the index of the MCB entry that is first <= vcn. * * On success it returns true and sets @index to the index of the entry found. * If the entry is missing from the list, @index is set to the insertion * position for the entry in question. */ static bool run_lookup(const struct runs_tree *run, CLST vcn, size_t *index) { size_t min_idx, max_idx, mid_idx; struct ntfs_run *r; if (!run->count) { *index = 0; return false; } min_idx = 0; max_idx = run->count - 1; /* Check the boundary cases specially, because they cover the most frequent requests. */ r = run->runs; if (vcn < r->vcn) { *index = 0; return false; } if (vcn < r->vcn + r->len) { *index = 0; return true; } r += max_idx; if (vcn >= r->vcn + r->len) { *index = run->count; return false; } if (vcn >= r->vcn) { *index = max_idx; return true; } do { mid_idx = min_idx + ((max_idx - min_idx) >> 1); r = run->runs + mid_idx; if (vcn < r->vcn) { max_idx = mid_idx - 1; if (!mid_idx) break; } else if (vcn >= r->vcn + r->len) { min_idx = mid_idx + 1; } else { *index = mid_idx; return true; } } while (min_idx <= max_idx); *index = max_idx + 1; return false; } /* * run_consolidate - Consolidate runs starting from a given one. */ static void run_consolidate(struct runs_tree *run, size_t index) { size_t i; struct ntfs_run *r = run->runs + index; while (index + 1 < run->count) { /* * Merge the current run with the next one * if the start of the next run lies inside the one being tested.
*/ struct ntfs_run *n = r + 1; CLST end = r->vcn + r->len; CLST dl; /* Stop if runs are not aligned one to another. */ if (n->vcn > end) break; dl = end - n->vcn; /* * If range at index overlaps with next one * then I will either adjust it's start position * or (if completely matches) dust remove one from the list. */ if (dl > 0) { if (n->len <= dl) goto remove_next_range; n->len -= dl; n->vcn += dl; if (n->lcn != SPARSE_LCN) n->lcn += dl; dl = 0; } /* * Stop if sparse mode does not match * both current and next runs. */ if ((n->lcn == SPARSE_LCN) != (r->lcn == SPARSE_LCN)) { index += 1; r = n; continue; } /* * Check if volume block * of a next run lcn does not match * last volume block of the current run. */ if (n->lcn != SPARSE_LCN && n->lcn != r->lcn + r->len) break; /* * Next and current are siblings. * Eat/join. */ r->len += n->len - dl; remove_next_range: i = run->count - (index + 1); if (i > 1) memmove(n, n + 1, sizeof(*n) * (i - 1)); run->count -= 1; } } /* * run_is_mapped_full * * Return: True if range [svcn - evcn] is mapped. */ bool run_is_mapped_full(const struct runs_tree *run, CLST svcn, CLST evcn) { size_t i; const struct ntfs_run *r, *end; CLST next_vcn; if (!run_lookup(run, svcn, &i)) return false; end = run->runs + run->count; r = run->runs + i; for (;;) { next_vcn = r->vcn + r->len; if (next_vcn > evcn) return true; if (++r >= end) return false; if (r->vcn != next_vcn) return false; } } bool run_lookup_entry(const struct runs_tree *run, CLST vcn, CLST *lcn, CLST *len, size_t *index) { size_t idx; CLST gap; struct ntfs_run *r; /* Fail immediately if nrun was not touched yet. */ if (!run->runs) return false; if (!run_lookup(run, vcn, &idx)) return false; r = run->runs + idx; if (vcn >= r->vcn + r->len) return false; gap = vcn - r->vcn; if (r->len <= gap) return false; *lcn = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + gap); if (len) *len = r->len - gap; if (index) *index = idx; return true; } /* * run_truncate_head - Decommit the range before vcn. */ void run_truncate_head(struct runs_tree *run, CLST vcn) { size_t index; struct ntfs_run *r; if (run_lookup(run, vcn, &index)) { r = run->runs + index; if (vcn > r->vcn) { CLST dlen = vcn - r->vcn; r->vcn = vcn; r->len -= dlen; if (r->lcn != SPARSE_LCN) r->lcn += dlen; } if (!index) return; } r = run->runs; memmove(r, r + index, sizeof(*r) * (run->count - index)); run->count -= index; if (!run->count) { kvfree(run->runs); run->runs = NULL; run->allocated = 0; } } /* * run_truncate - Decommit the range after vcn. */ void run_truncate(struct runs_tree *run, CLST vcn) { size_t index; /* * If I hit the range then * I have to truncate one. * If range to be truncated is becoming empty * then it will entirely be removed. */ if (run_lookup(run, vcn, &index)) { struct ntfs_run *r = run->runs + index; r->len = vcn - r->vcn; if (r->len > 0) index += 1; } /* * At this point 'index' is set to position that * should be thrown away (including index itself) * Simple one - just set the limit. */ run->count = index; /* Do not reallocate array 'runs'. Only free if possible. */ if (!index) { kvfree(run->runs); run->runs = NULL; run->allocated = 0; } } /* * run_truncate_around - Trim head and tail if necessary. */ void run_truncate_around(struct runs_tree *run, CLST vcn) { run_truncate_head(run, vcn); if (run->count >= NTFS3_RUN_MAX_BYTES / sizeof(struct ntfs_run) / 2) run_truncate(run, (run->runs + (run->count >> 1))->vcn); } /* * run_add_entry * * Sets location to known state. * Run to be added may overlap with existing location. 
* * Return: false if of memory. */ bool run_add_entry(struct runs_tree *run, CLST vcn, CLST lcn, CLST len, bool is_mft) { size_t used, index; struct ntfs_run *r; bool inrange; CLST tail_vcn = 0, tail_len = 0, tail_lcn = 0; bool should_add_tail = false; /* * Lookup the insertion point. * * Execute bsearch for the entry containing * start position question. */ inrange = run_lookup(run, vcn, &index); /* * Shortcut here would be case of * range not been found but one been added * continues previous run. * This case I can directly make use of * existing range as my start point. */ if (!inrange && index > 0) { struct ntfs_run *t = run->runs + index - 1; if (t->vcn + t->len == vcn && (t->lcn == SPARSE_LCN) == (lcn == SPARSE_LCN) && (lcn == SPARSE_LCN || lcn == t->lcn + t->len)) { inrange = true; index -= 1; } } /* * At this point 'index' either points to the range * containing start position or to the insertion position * for a new range. * So first let's check if range I'm probing is here already. */ if (!inrange) { requires_new_range: /* * Range was not found. * Insert at position 'index' */ used = run->count * sizeof(struct ntfs_run); /* * Check allocated space. * If one is not enough to get one more entry * then it will be reallocated. */ if (run->allocated < used + sizeof(struct ntfs_run)) { size_t bytes; struct ntfs_run *new_ptr; /* Use power of 2 for 'bytes'. */ if (!used) { bytes = 64; } else if (used <= 16 * PAGE_SIZE) { if (is_power_of_2(run->allocated)) bytes = run->allocated << 1; else bytes = (size_t)1 << (2 + blksize_bits(used)); } else { bytes = run->allocated + (16 * PAGE_SIZE); } WARN_ON(!is_mft && bytes > NTFS3_RUN_MAX_BYTES); new_ptr = kvmalloc(bytes, GFP_KERNEL); if (!new_ptr) return false; r = new_ptr + index; memcpy(new_ptr, run->runs, index * sizeof(struct ntfs_run)); memcpy(r + 1, run->runs + index, sizeof(struct ntfs_run) * (run->count - index)); kvfree(run->runs); run->runs = new_ptr; run->allocated = bytes; } else { size_t i = run->count - index; r = run->runs + index; /* memmove appears to be a bottle neck here... */ if (i > 0) memmove(r + 1, r, sizeof(struct ntfs_run) * i); } r->vcn = vcn; r->lcn = lcn; r->len = len; run->count += 1; } else { r = run->runs + index; /* * If one of ranges was not allocated then we * have to split location we just matched and * insert current one. * A common case this requires tail to be reinserted * a recursive call. */ if (((lcn == SPARSE_LCN) != (r->lcn == SPARSE_LCN)) || (lcn != SPARSE_LCN && lcn != r->lcn + (vcn - r->vcn))) { CLST to_eat = vcn - r->vcn; CLST Tovcn = to_eat + len; should_add_tail = Tovcn < r->len; if (should_add_tail) { tail_lcn = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + Tovcn); tail_vcn = r->vcn + Tovcn; tail_len = r->len - Tovcn; } if (to_eat > 0) { r->len = to_eat; inrange = false; index += 1; goto requires_new_range; } /* lcn should match one were going to add. */ r->lcn = lcn; } /* * If existing range fits then were done. * Otherwise extend found one and fall back to range jocode. */ if (r->vcn + r->len < vcn + len) r->len += len - ((r->vcn + r->len) - vcn); } /* * And normalize it starting from insertion point. * It's possible that no insertion needed case if * start point lies within the range of an entry * that 'index' points to. */ if (inrange && index > 0) index -= 1; run_consolidate(run, index); run_consolidate(run, index + 1); /* * A special case. * We have to add extra range a tail. 
*/ if (should_add_tail && !run_add_entry(run, tail_vcn, tail_lcn, tail_len, is_mft)) return false; return true; } /* run_collapse_range * * Helper for attr_collapse_range(), * which is helper for fallocate(collapse_range). */ bool run_collapse_range(struct runs_tree *run, CLST vcn, CLST len) { size_t index, eat; struct ntfs_run *r, *e, *eat_start, *eat_end; CLST end; if (WARN_ON(!run_lookup(run, vcn, &index))) return true; /* Should never be here. */ e = run->runs + run->count; r = run->runs + index; end = vcn + len; if (vcn > r->vcn) { if (r->vcn + r->len <= end) { /* Collapse tail of run .*/ r->len = vcn - r->vcn; } else if (r->lcn == SPARSE_LCN) { /* Collapse a middle part of sparsed run. */ r->len -= len; } else { /* Collapse a middle part of normal run, split. */ if (!run_add_entry(run, vcn, SPARSE_LCN, len, false)) return false; return run_collapse_range(run, vcn, len); } r += 1; } eat_start = r; eat_end = r; for (; r < e; r++) { CLST d; if (r->vcn >= end) { r->vcn -= len; continue; } if (r->vcn + r->len <= end) { /* Eat this run. */ eat_end = r + 1; continue; } d = end - r->vcn; if (r->lcn != SPARSE_LCN) r->lcn += d; r->len -= d; r->vcn -= len - d; } eat = eat_end - eat_start; memmove(eat_start, eat_end, (e - eat_end) * sizeof(*r)); run->count -= eat; return true; } /* run_insert_range * * Helper for attr_insert_range(), * which is helper for fallocate(insert_range). */ bool run_insert_range(struct runs_tree *run, CLST vcn, CLST len) { size_t index; struct ntfs_run *r, *e; if (WARN_ON(!run_lookup(run, vcn, &index))) return false; /* Should never be here. */ e = run->runs + run->count; r = run->runs + index; if (vcn > r->vcn) r += 1; for (; r < e; r++) r->vcn += len; r = run->runs + index; if (vcn > r->vcn) { /* split fragment. */ CLST len1 = vcn - r->vcn; CLST len2 = r->len - len1; CLST lcn2 = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + len1); r->len = len1; if (!run_add_entry(run, vcn + len, lcn2, len2, false)) return false; } if (!run_add_entry(run, vcn, SPARSE_LCN, len, false)) return false; return true; } /* * run_get_entry - Return index-th mapped region. */ bool run_get_entry(const struct runs_tree *run, size_t index, CLST *vcn, CLST *lcn, CLST *len) { const struct ntfs_run *r; if (index >= run->count) return false; r = run->runs + index; if (!r->len) return false; if (vcn) *vcn = r->vcn; if (lcn) *lcn = r->lcn; if (len) *len = r->len; return true; } /* * run_packed_size - Calculate the size of packed int64. */ #ifdef __BIG_ENDIAN static inline int run_packed_size(const s64 n) { const u8 *p = (const u8 *)&n + sizeof(n) - 1; if (n >= 0) { if (p[-7] || p[-6] || p[-5] || p[-4]) p -= 4; if (p[-3] || p[-2]) p -= 2; if (p[-1]) p -= 1; if (p[0] & 0x80) p -= 1; } else { if (p[-7] != 0xff || p[-6] != 0xff || p[-5] != 0xff || p[-4] != 0xff) p -= 4; if (p[-3] != 0xff || p[-2] != 0xff) p -= 2; if (p[-1] != 0xff) p -= 1; if (!(p[0] & 0x80)) p -= 1; } return (const u8 *)&n + sizeof(n) - p; } /* Full trusted function. It does not check 'size' for errors. */ static inline void run_pack_s64(u8 *run_buf, u8 size, s64 v) { const u8 *p = (u8 *)&v; switch (size) { case 8: run_buf[7] = p[0]; fallthrough; case 7: run_buf[6] = p[1]; fallthrough; case 6: run_buf[5] = p[2]; fallthrough; case 5: run_buf[4] = p[3]; fallthrough; case 4: run_buf[3] = p[4]; fallthrough; case 3: run_buf[2] = p[5]; fallthrough; case 2: run_buf[1] = p[6]; fallthrough; case 1: run_buf[0] = p[7]; } } /* Full trusted function. It does not check 'size' for errors. 
*/ static inline s64 run_unpack_s64(const u8 *run_buf, u8 size, s64 v) { u8 *p = (u8 *)&v; switch (size) { case 8: p[0] = run_buf[7]; fallthrough; case 7: p[1] = run_buf[6]; fallthrough; case 6: p[2] = run_buf[5]; fallthrough; case 5: p[3] = run_buf[4]; fallthrough; case 4: p[4] = run_buf[3]; fallthrough; case 3: p[5] = run_buf[2]; fallthrough; case 2: p[6] = run_buf[1]; fallthrough; case 1: p[7] = run_buf[0]; } return v; } #else static inline int run_packed_size(const s64 n) { const u8 *p = (const u8 *)&n; if (n >= 0) { if (p[7] || p[6] || p[5] || p[4]) p += 4; if (p[3] || p[2]) p += 2; if (p[1]) p += 1; if (p[0] & 0x80) p += 1; } else { if (p[7] != 0xff || p[6] != 0xff || p[5] != 0xff || p[4] != 0xff) p += 4; if (p[3] != 0xff || p[2] != 0xff) p += 2; if (p[1] != 0xff) p += 1; if (!(p[0] & 0x80)) p += 1; } return 1 + p - (const u8 *)&n; } /* Full trusted function. It does not check 'size' for errors. */ static inline void run_pack_s64(u8 *run_buf, u8 size, s64 v) { const u8 *p = (u8 *)&v; /* memcpy( run_buf, &v, size); Is it faster? */ switch (size) { case 8: run_buf[7] = p[7]; fallthrough; case 7: run_buf[6] = p[6]; fallthrough; case 6: run_buf[5] = p[5]; fallthrough; case 5: run_buf[4] = p[4]; fallthrough; case 4: run_buf[3] = p[3]; fallthrough; case 3: run_buf[2] = p[2]; fallthrough; case 2: run_buf[1] = p[1]; fallthrough; case 1: run_buf[0] = p[0]; } } /* full trusted function. It does not check 'size' for errors */ static inline s64 run_unpack_s64(const u8 *run_buf, u8 size, s64 v) { u8 *p = (u8 *)&v; /* memcpy( &v, run_buf, size); Is it faster? */ switch (size) { case 8: p[7] = run_buf[7]; fallthrough; case 7: p[6] = run_buf[6]; fallthrough; case 6: p[5] = run_buf[5]; fallthrough; case 5: p[4] = run_buf[4]; fallthrough; case 4: p[3] = run_buf[3]; fallthrough; case 3: p[2] = run_buf[2]; fallthrough; case 2: p[1] = run_buf[1]; fallthrough; case 1: p[0] = run_buf[0]; } return v; } #endif /* * run_pack - Pack runs into buffer. * * packed_vcns - How much runs we have packed. * packed_size - How much bytes we have used run_buf. */ int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf, u32 run_buf_size, CLST *packed_vcns) { CLST next_vcn, vcn, lcn; CLST prev_lcn = 0; CLST evcn1 = svcn + len; const struct ntfs_run *r, *r_end; int packed_size = 0; size_t i; s64 dlcn; int offset_size, size_size, tmp; *packed_vcns = 0; if (!len) goto out; /* Check all required entries [svcn, encv1) available. */ if (!run_lookup(run, svcn, &i)) return -ENOENT; r_end = run->runs + run->count; r = run->runs + i; for (next_vcn = r->vcn + r->len; next_vcn < evcn1; next_vcn = r->vcn + r->len) { if (++r >= r_end || r->vcn != next_vcn) return -ENOENT; } /* Repeat cycle above and pack runs. Assume no errors. */ r = run->runs + i; len = svcn - r->vcn; vcn = svcn; lcn = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + len); len = r->len - len; for (;;) { next_vcn = vcn + len; if (next_vcn > evcn1) len = evcn1 - vcn; /* How much bytes required to pack len. */ size_size = run_packed_size(len); /* offset_size - How much bytes is packed dlcn. */ if (lcn == SPARSE_LCN) { offset_size = 0; dlcn = 0; } else { /* NOTE: lcn can be less than prev_lcn! */ dlcn = (s64)lcn - prev_lcn; offset_size = run_packed_size(dlcn); prev_lcn = lcn; } tmp = run_buf_size - packed_size - 2 - offset_size; if (tmp <= 0) goto out; /* Can we store this entire run. */ if (tmp < size_size) goto out; if (run_buf) { /* Pack run header. */ run_buf[0] = ((u8)(size_size | (offset_size << 4))); run_buf += 1; /* Pack the length of run. 
*/ run_pack_s64(run_buf, size_size, len); run_buf += size_size; /* Pack the offset from previous LCN. */ run_pack_s64(run_buf, offset_size, dlcn); run_buf += offset_size; } packed_size += 1 + offset_size + size_size; *packed_vcns += len; if (packed_size + 1 >= run_buf_size || next_vcn >= evcn1) goto out; r += 1; vcn = r->vcn; lcn = r->lcn; len = r->len; } out: /* Store last zero. */ if (run_buf) run_buf[0] = 0; return packed_size + 1; } /* * run_unpack - Unpack packed runs from @run_buf. * * Return: Error if negative, or real used bytes. */ int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf, int run_buf_size) { u64 prev_lcn, vcn64, lcn, next_vcn; const u8 *run_last, *run_0; bool is_mft = ino == MFT_REC_MFT; if (run_buf_size < 0) return -EINVAL; /* Check for empty. */ if (evcn + 1 == svcn) return 0; if (evcn < svcn) return -EINVAL; run_0 = run_buf; run_last = run_buf + run_buf_size; prev_lcn = 0; vcn64 = svcn; /* Read all runs the chain. */ /* size_size - How much bytes is packed len. */ while (run_buf < run_last) { /* size_size - How much bytes is packed len. */ u8 size_size = *run_buf & 0xF; /* offset_size - How much bytes is packed dlcn. */ u8 offset_size = *run_buf++ >> 4; u64 len; if (!size_size) break; /* * Unpack runs. * NOTE: Runs are stored little endian order * "len" is unsigned value, "dlcn" is signed. * Large positive number requires to store 5 bytes * e.g.: 05 FF 7E FF FF 00 00 00 */ if (size_size > 8) return -EINVAL; len = run_unpack_s64(run_buf, size_size, 0); /* Skip size_size. */ run_buf += size_size; if (!len) return -EINVAL; if (!offset_size) lcn = SPARSE_LCN64; else if (offset_size <= 8) { s64 dlcn; /* Initial value of dlcn is -1 or 0. */ dlcn = (run_buf[offset_size - 1] & 0x80) ? (s64)-1 : 0; dlcn = run_unpack_s64(run_buf, offset_size, dlcn); /* Skip offset_size. */ run_buf += offset_size; if (!dlcn) return -EINVAL; lcn = prev_lcn + dlcn; prev_lcn = lcn; } else return -EINVAL; next_vcn = vcn64 + len; /* Check boundary. */ if (next_vcn > evcn + 1) return -EINVAL; #ifndef CONFIG_NTFS3_64BIT_CLUSTER if (next_vcn > 0x100000000ull || (lcn + len) > 0x100000000ull) { ntfs_err( sbi->sb, "This driver is compiled without CONFIG_NTFS3_64BIT_CLUSTER (like windows driver).\n" "Volume contains 64 bits run: vcn %llx, lcn %llx, len %llx.\n" "Activate CONFIG_NTFS3_64BIT_CLUSTER to process this case", vcn64, lcn, len); return -EOPNOTSUPP; } #endif if (lcn != SPARSE_LCN64 && lcn + len > sbi->used.bitmap.nbits) { /* LCN range is out of volume. */ return -EINVAL; } if (!run) ; /* Called from check_attr(fslog.c) to check run. */ else if (run == RUN_DEALLOCATE) { /* * Called from ni_delete_all to free clusters * without storing in run. */ if (lcn != SPARSE_LCN64) mark_as_free_ex(sbi, lcn, len, true); } else if (vcn64 >= vcn) { if (!run_add_entry(run, vcn64, lcn, len, is_mft)) return -ENOMEM; } else if (next_vcn > vcn) { u64 dlen = vcn - vcn64; if (!run_add_entry(run, vcn, lcn + dlen, len - dlen, is_mft)) return -ENOMEM; } vcn64 = next_vcn; } if (vcn64 != evcn + 1) { /* Not expected length of unpacked runs. */ return -EINVAL; } return run_buf - run_0; } #ifdef NTFS3_CHECK_FREE_CLST /* * run_unpack_ex - Unpack packed runs from "run_buf". * * Checks unpacked runs to be used in bitmap. * * Return: Error if negative, or real used bytes. 
*/ int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf, int run_buf_size) { int ret, err; CLST next_vcn, lcn, len; size_t index; bool ok; struct wnd_bitmap *wnd; ret = run_unpack(run, sbi, ino, svcn, evcn, vcn, run_buf, run_buf_size); if (ret <= 0) return ret; if (!sbi->used.bitmap.sb || !run || run == RUN_DEALLOCATE) return ret; if (ino == MFT_REC_BADCLUST) return ret; next_vcn = vcn = svcn; wnd = &sbi->used.bitmap; for (ok = run_lookup_entry(run, vcn, &lcn, &len, &index); next_vcn <= evcn; ok = run_get_entry(run, ++index, &vcn, &lcn, &len)) { if (!ok || next_vcn != vcn) return -EINVAL; next_vcn = vcn + len; if (lcn == SPARSE_LCN) continue; if (sbi->flags & NTFS_FLAGS_NEED_REPLAY) continue; down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); /* Check for free blocks. */ ok = wnd_is_used(wnd, lcn, len); up_read(&wnd->rw_lock); if (ok) continue; /* Looks like volume is corrupted. */ ntfs_set_state(sbi, NTFS_DIRTY_ERROR); if (down_write_trylock(&wnd->rw_lock)) { /* Mark all zero bits as used in range [lcn, lcn+len). */ size_t done; err = wnd_set_used_safe(wnd, lcn, len, &done); up_write(&wnd->rw_lock); if (err) return err; } } return ret; } #endif /* * run_get_highest_vcn * * Return the highest vcn from a mapping pairs array * it used while replaying log file. */ int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn) { u64 vcn64 = vcn; u8 size_size; while ((size_size = *run_buf & 0xF)) { u8 offset_size = *run_buf++ >> 4; u64 len; if (size_size > 8 || offset_size > 8) return -EINVAL; len = run_unpack_s64(run_buf, size_size, 0); if (!len) return -EINVAL; run_buf += size_size + offset_size; vcn64 += len; #ifndef CONFIG_NTFS3_64BIT_CLUSTER if (vcn64 > 0x100000000ull) return -EINVAL; #endif } *highest_vcn = vcn64 - 1; return 0; } /* * run_clone * * Make a copy of run */ int run_clone(const struct runs_tree *run, struct runs_tree *new_run) { size_t bytes = run->count * sizeof(struct ntfs_run); if (bytes > new_run->allocated) { struct ntfs_run *new_ptr = kvmalloc(bytes, GFP_KERNEL); if (!new_ptr) return -ENOMEM; kvfree(new_run->runs); new_run->runs = new_ptr; new_run->allocated = bytes; } memcpy(new_run->runs, run->runs, bytes); new_run->count = run->count; return 0; }
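/*
 * Illustrative sketch (not part of the driver above): decoding one packed
 * mapping-pairs entry the way run_unpack() does. The header byte keeps the
 * byte count of the run length in its low nibble and the byte count of the
 * signed LCN delta in its high nibble; both fields are little endian, a zero
 * header byte terminates the list, and a zero-sized delta marks a sparse run.
 * Standalone userspace types are used and the helper names are made up.
 */
#include <stdint.h>
#include <stddef.h>

/* Little-endian-host counterpart of run_unpack_s64(); 'v' carries the sign extension. */
static int64_t unpack_s64(const uint8_t *p, unsigned int size, int64_t v)
{
	unsigned int i;

	for (i = 0; i < size; i++)
		((uint8_t *)&v)[i] = p[i];
	return v;
}

/* Returns bytes consumed, 0 on the terminating header, -1 on a malformed entry. */
static int decode_one_run(const uint8_t *buf, size_t avail,
			  uint64_t *len, int64_t *dlcn, int *is_sparse)
{
	unsigned int size_size, offset_size;

	if (!avail)
		return -1;
	size_size = buf[0] & 0xF;	/* bytes used by the cluster count */
	offset_size = buf[0] >> 4;	/* bytes used by the LCN delta */
	if (!size_size)
		return 0;		/* end of the mapping pairs array */
	if (size_size > 8 || offset_size > 8 ||
	    avail < 1u + size_size + offset_size)
		return -1;

	*len = unpack_s64(buf + 1, size_size, 0);
	if (!offset_size) {
		*is_sparse = 1;		/* no delta stored: sparse run */
		*dlcn = 0;
	} else {
		/* Pre-set -1 or 0 depending on the sign bit of the last delta byte. */
		int64_t init = (buf[size_size + offset_size] & 0x80) ? -1 : 0;

		*is_sparse = 0;
		*dlcn = unpack_s64(buf + 1 + size_size, offset_size, init);
	}
	return 1 + size_size + offset_size;
}

/*
 * A caller would accumulate lcn = prev_lcn + dlcn and vcn += len per entry,
 * exactly as run_unpack() does above.
 */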
868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 // SPDX-License-Identifier: GPL-2.0 /* * inode.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004,2019 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. * Copyright (C) 2019 Linux Foundation <gregkh@linuxfoundation.org> * * debugfs is for people to use instead of /proc or /sys. * See ./Documentation/core-api/kernel-api.rst for more details. */ #define pr_fmt(fmt) "debugfs: " fmt #include <linux/module.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/kobject.h> #include <linux/namei.h> #include <linux/debugfs.h> #include <linux/fsnotify.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/parser.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/security.h> #include "internal.h" #define DEBUGFS_DEFAULT_MODE 0700 static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS; /* * Don't allow access attributes to be changed whilst the kernel is locked down * so that we can use the file mode as part of a heuristic to determine whether * to lock down individual files. */ static int debugfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *ia) { int ret; if (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) { ret = security_locked_down(LOCKDOWN_DEBUGFS); if (ret) return ret; } return simple_setattr(&nop_mnt_idmap, dentry, ia); } static const struct inode_operations debugfs_file_inode_operations = { .setattr = debugfs_setattr, }; static const struct inode_operations debugfs_dir_inode_operations = { .lookup = simple_lookup, .setattr = debugfs_setattr, }; static const struct inode_operations debugfs_symlink_inode_operations = { .get_link = simple_get_link, .setattr = debugfs_setattr, }; static struct inode *debugfs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); simple_inode_init_ts(inode); } return inode; } struct debugfs_mount_opts { kuid_t uid; kgid_t gid; umode_t mode; /* Opt_* bitfield. 
*/ unsigned int opts; }; enum { Opt_uid, Opt_gid, Opt_mode, Opt_err }; static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_mode, "mode=%o"}, {Opt_err, NULL} }; struct debugfs_fs_info { struct debugfs_mount_opts mount_opts; }; static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) { substring_t args[MAX_OPT_ARGS]; int option; int token; kuid_t uid; kgid_t gid; char *p; opts->opts = 0; opts->mode = DEBUGFS_DEFAULT_MODE; while ((p = strsep(&data, ",")) != NULL) { if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case Opt_uid: if (match_int(&args[0], &option)) return -EINVAL; uid = make_kuid(current_user_ns(), option); if (!uid_valid(uid)) return -EINVAL; opts->uid = uid; break; case Opt_gid: if (match_int(&args[0], &option)) return -EINVAL; gid = make_kgid(current_user_ns(), option); if (!gid_valid(gid)) return -EINVAL; opts->gid = gid; break; case Opt_mode: if (match_octal(&args[0], &option)) return -EINVAL; opts->mode = option & S_IALLUGO; break; /* * We might like to report bad mount options here; * but traditionally debugfs has ignored all mount options */ } opts->opts |= BIT(token); } return 0; } static void _debugfs_apply_options(struct super_block *sb, bool remount) { struct debugfs_fs_info *fsi = sb->s_fs_info; struct inode *inode = d_inode(sb->s_root); struct debugfs_mount_opts *opts = &fsi->mount_opts; /* * On remount, only reset mode/uid/gid if they were provided as mount * options. */ if (!remount || opts->opts & BIT(Opt_mode)) { inode->i_mode &= ~S_IALLUGO; inode->i_mode |= opts->mode; } if (!remount || opts->opts & BIT(Opt_uid)) inode->i_uid = opts->uid; if (!remount || opts->opts & BIT(Opt_gid)) inode->i_gid = opts->gid; } static void debugfs_apply_options(struct super_block *sb) { _debugfs_apply_options(sb, false); } static void debugfs_apply_options_remount(struct super_block *sb) { _debugfs_apply_options(sb, true); } static int debugfs_remount(struct super_block *sb, int *flags, char *data) { int err; struct debugfs_fs_info *fsi = sb->s_fs_info; sync_filesystem(sb); err = debugfs_parse_options(data, &fsi->mount_opts); if (err) goto fail; debugfs_apply_options_remount(sb); fail: return err; } static int debugfs_show_options(struct seq_file *m, struct dentry *root) { struct debugfs_fs_info *fsi = root->d_sb->s_fs_info; struct debugfs_mount_opts *opts = &fsi->mount_opts; if (!uid_eq(opts->uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, opts->uid)); if (!gid_eq(opts->gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, opts->gid)); if (opts->mode != DEBUGFS_DEFAULT_MODE) seq_printf(m, ",mode=%o", opts->mode); return 0; } static void debugfs_free_inode(struct inode *inode) { if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); free_inode_nonrcu(inode); } static const struct super_operations debugfs_super_operations = { .statfs = simple_statfs, .remount_fs = debugfs_remount, .show_options = debugfs_show_options, .free_inode = debugfs_free_inode, }; static void debugfs_release_dentry(struct dentry *dentry) { struct debugfs_fsdata *fsd = dentry->d_fsdata; if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) return; /* check it wasn't a dir (no fsdata) or automount (no real_fops) */ if (fsd && fsd->real_fops) { WARN_ON(!list_empty(&fsd->cancellations)); mutex_destroy(&fsd->cancellations_mtx); } kfree(fsd); } static struct vfsmount *debugfs_automount(struct path *path) { struct debugfs_fsdata *fsd = path->dentry->d_fsdata; return 
fsd->automount(path->dentry, d_inode(path->dentry)->i_private); } static const struct dentry_operations debugfs_dops = { .d_delete = always_delete_dentry, .d_release = debugfs_release_dentry, .d_automount = debugfs_automount, }; static int debug_fill_super(struct super_block *sb, void *data, int silent) { static const struct tree_descr debug_files[] = {{""}}; struct debugfs_fs_info *fsi; int err; fsi = kzalloc(sizeof(struct debugfs_fs_info), GFP_KERNEL); sb->s_fs_info = fsi; if (!fsi) { err = -ENOMEM; goto fail; } err = debugfs_parse_options(data, &fsi->mount_opts); if (err) goto fail; err = simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); if (err) goto fail; sb->s_op = &debugfs_super_operations; sb->s_d_op = &debugfs_dops; debugfs_apply_options(sb); return 0; fail: kfree(fsi); sb->s_fs_info = NULL; return err; } static struct dentry *debug_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { if (!(debugfs_allow & DEBUGFS_ALLOW_API)) return ERR_PTR(-EPERM); return mount_single(fs_type, flags, data, debug_fill_super); } static struct file_system_type debug_fs_type = { .owner = THIS_MODULE, .name = "debugfs", .mount = debug_mount, .kill_sb = kill_litter_super, }; MODULE_ALIAS_FS("debugfs"); /** * debugfs_lookup() - look up an existing debugfs file * @name: a pointer to a string containing the name of the file to look up. * @parent: a pointer to the parent dentry of the file. * * This function will return a pointer to a dentry if it succeeds. If the file * doesn't exist or an error occurs, %NULL will be returned. The returned * dentry must be passed to dput() when it is no longer needed. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) { struct dentry *dentry; if (!debugfs_initialized() || IS_ERR_OR_NULL(name) || IS_ERR(parent)) return NULL; if (!parent) parent = debugfs_mount->mnt_root; dentry = lookup_positive_unlocked(name, parent, strlen(name)); if (IS_ERR(dentry)) return NULL; return dentry; } EXPORT_SYMBOL_GPL(debugfs_lookup); static struct dentry *start_creating(const char *name, struct dentry *parent) { struct dentry *dentry; int error; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) return ERR_PTR(-EPERM); if (!debugfs_initialized()) return ERR_PTR(-ENOENT); pr_debug("creating file '%s'\n", name); if (IS_ERR(parent)) return parent; error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); if (error) { pr_err("Unable to pin filesystem for file '%s'\n", name); return ERR_PTR(error); } /* If the parent is not specified, we create it in the root. * We need the root dentry to do this, which is in the super * block. A pointer to that is in the struct vfsmount that we * have around. 
*/ if (!parent) parent = debugfs_mount->mnt_root; inode_lock(d_inode(parent)); if (unlikely(IS_DEADDIR(d_inode(parent)))) dentry = ERR_PTR(-ENOENT); else dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { if (d_is_dir(dentry)) pr_err("Directory '%s' with parent '%s' already present!\n", name, parent->d_name.name); else pr_err("File '%s' in directory '%s' already present!\n", name, parent->d_name.name); dput(dentry); dentry = ERR_PTR(-EEXIST); } if (IS_ERR(dentry)) { inode_unlock(d_inode(parent)); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } return dentry; } static struct dentry *failed_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); dput(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); return ERR_PTR(-ENOMEM); } static struct dentry *end_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); return dentry; } static struct dentry *__debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *proxy_fops, const struct file_operations *real_fops) { struct dentry *dentry; struct inode *inode; if (!(mode & S_IFMT)) mode |= S_IFREG; BUG_ON(!S_ISREG(mode)); dentry = start_creating(name, parent); if (IS_ERR(dentry)) return dentry; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { failed_creating(dentry); return ERR_PTR(-EPERM); } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create file '%s'\n", name); return failed_creating(dentry); } inode->i_mode = mode; inode->i_private = data; inode->i_op = &debugfs_file_inode_operations; inode->i_fop = proxy_fops; dentry->d_fsdata = (void *)((unsigned long)real_fops | DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } /** * debugfs_create_file - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * * This is the basic "create a file" function for debugfs. It allows for a * wide range of flexibility in creating a file, or a directory (if you want * to create a directory, the debugfs_create_dir() function is * recommended to be used instead.) * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. * * NOTE: it's expected that most callers should _ignore_ the errors returned * by this function. Other debugfs functions handle the fact that the "dentry" * passed to them could be an error and they don't crash in that case. * Drivers should generally work fine even if debugfs fails to init anyway. 
*/ struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { return __debugfs_create_file(name, mode, parent, data, fops ? &debugfs_full_proxy_file_operations : &debugfs_noop_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file); /** * debugfs_create_file_unsafe - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * * debugfs_create_file_unsafe() is completely analogous to * debugfs_create_file(), the only difference being that the fops * handed it will not get protected against file removals by the * debugfs core. * * It is your responsibility to protect your struct file_operation * methods against file removals by means of debugfs_file_get() * and debugfs_file_put(). ->open() is still protected by * debugfs though. * * Any struct file_operations defined by means of * DEFINE_DEBUGFS_ATTRIBUTE() is protected against file removals and * thus, may be used here. */ struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { return __debugfs_create_file(name, mode, parent, data, fops ? &debugfs_open_proxy_file_operations : &debugfs_noop_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe); /** * debugfs_create_file_size - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * @file_size: initial file size * * This is the basic "create a file" function for debugfs. It allows for a * wide range of flexibility in creating a file, or a directory (if you want * to create a directory, the debugfs_create_dir() function is * recommended to be used instead.) */ void debugfs_create_file_size(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops, loff_t file_size) { struct dentry *de = debugfs_create_file(name, mode, parent, data, fops); if (!IS_ERR(de)) d_inode(de)->i_size = file_size; } EXPORT_SYMBOL_GPL(debugfs_create_file_size); /** * debugfs_create_dir - create a directory in the debugfs filesystem * @name: a pointer to a string containing the name of the directory to * create. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * directory will be created in the root of the debugfs filesystem. * * This function creates a directory in debugfs with the given name. 
* * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. * * NOTE: it's expected that most callers should _ignore_ the errors returned * by this function. Other debugfs functions handle the fact that the "dentry" * passed to them could be an error and they don't crash in that case. * Drivers should generally work fine even if debugfs fails to init anyway. */ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) { struct dentry *dentry = start_creating(name, parent); struct inode *inode; if (IS_ERR(dentry)) return dentry; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { failed_creating(dentry); return ERR_PTR(-EPERM); } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create directory '%s'\n", name); return failed_creating(dentry); } inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; inode->i_op = &debugfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_dir); /** * debugfs_create_automount - create automount point in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @f: function to be called when pathname resolution steps on that one. * @data: opaque argument to pass to f(). * * @f should return what ->d_automount() would. */ struct dentry *debugfs_create_automount(const char *name, struct dentry *parent, debugfs_automount_t f, void *data) { struct dentry *dentry = start_creating(name, parent); struct debugfs_fsdata *fsd; struct inode *inode; if (IS_ERR(dentry)) return dentry; fsd = kzalloc(sizeof(*fsd), GFP_KERNEL); if (!fsd) { failed_creating(dentry); return ERR_PTR(-ENOMEM); } fsd->automount = f; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { failed_creating(dentry); kfree(fsd); return ERR_PTR(-EPERM); } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create automount '%s'\n", name); kfree(fsd); return failed_creating(dentry); } make_empty_dir_inode(inode); inode->i_flags |= S_AUTOMOUNT; inode->i_private = data; dentry->d_fsdata = fsd; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } EXPORT_SYMBOL(debugfs_create_automount); /** * debugfs_create_symlink- create a symbolic link in the debugfs filesystem * @name: a pointer to a string containing the name of the symbolic link to * create. * @parent: a pointer to the parent dentry for this symbolic link. This * should be a directory dentry if set. If this parameter is NULL, * then the symbolic link will be created in the root of the debugfs * filesystem. 
* @target: a pointer to a string containing the path to the target of the * symbolic link. * * This function creates a symbolic link with the given name in debugfs that * links to the given target path. * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the symbolic * link is to be removed (no automatic cleanup happens if your module is * unloaded, you are responsible here.) If an error occurs, ERR_PTR(-ERROR) * will be returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, const char *target) { struct dentry *dentry; struct inode *inode; char *link = kstrdup(target, GFP_KERNEL); if (!link) return ERR_PTR(-ENOMEM); dentry = start_creating(name, parent); if (IS_ERR(dentry)) { kfree(link); return dentry; } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create symlink '%s'\n", name); kfree(link); return failed_creating(dentry); } inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_op = &debugfs_symlink_inode_operations; inode->i_link = link; d_instantiate(dentry, inode); return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_symlink); static void __debugfs_file_removed(struct dentry *dentry) { struct debugfs_fsdata *fsd; /* * Paired with the closing smp_mb() implied by a successful * cmpxchg() in debugfs_file_get(): either * debugfs_file_get() must see a dead dentry or we must see a * debugfs_fsdata instance at ->d_fsdata here (or both). */ smp_mb(); fsd = READ_ONCE(dentry->d_fsdata); if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) return; /* if we hit zero, just wait for all to finish */ if (!refcount_dec_and_test(&fsd->active_users)) { wait_for_completion(&fsd->active_users_drained); return; } /* if we didn't hit zero, try to cancel any we can */ while (refcount_read(&fsd->active_users)) { struct debugfs_cancellation *c; /* * Lock the cancellations. Note that the cancellations * structs are meant to be on the stack, so we need to * ensure we either use them here or don't touch them, * and debugfs_leave_cancellation() will wait for this * to be finished processing before exiting one. It may * of course win and remove the cancellation, but then * chances are we never even got into this bit, we only * do if the refcount isn't zero already. */ mutex_lock(&fsd->cancellations_mtx); while ((c = list_first_entry_or_null(&fsd->cancellations, typeof(*c), list))) { list_del_init(&c->list); c->cancel(dentry, c->cancel_data); } mutex_unlock(&fsd->cancellations_mtx); wait_for_completion(&fsd->active_users_drained); } } static void remove_one(struct dentry *victim) { if (d_is_reg(victim)) __debugfs_file_removed(victim); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } /** * debugfs_remove - recursively removes a directory * @dentry: a pointer to a the dentry of the directory to be removed. If this * parameter is NULL or an error value, nothing will be done. * * This function recursively removes a directory tree in debugfs that * was previously created with a call to another debugfs function * (like debugfs_create_file() or variants thereof.) * * This function is required to be called in order for the file to be * removed, no automatic cleanup of files will happen when a module is * removed, you are responsible here. 
*/ void debugfs_remove(struct dentry *dentry) { if (IS_ERR_OR_NULL(dentry)) return; simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); simple_recursive_removal(dentry, remove_one); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } EXPORT_SYMBOL_GPL(debugfs_remove); /** * debugfs_lookup_and_remove - lookup a directory or file and recursively remove it * @name: a pointer to a string containing the name of the item to look up. * @parent: a pointer to the parent dentry of the item. * * This is the equlivant of doing something like * debugfs_remove(debugfs_lookup(..)) but with the proper reference counting * handled for the directory being looked up. */ void debugfs_lookup_and_remove(const char *name, struct dentry *parent) { struct dentry *dentry; dentry = debugfs_lookup(name, parent); if (!dentry) return; debugfs_remove(dentry); dput(dentry); } EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove); /** * debugfs_rename - rename a file/directory in the debugfs filesystem * @old_dir: a pointer to the parent dentry for the renamed object. This * should be a directory dentry. * @old_dentry: dentry of an object to be renamed. * @new_dir: a pointer to the parent dentry where the object should be * moved. This should be a directory dentry. * @new_name: a pointer to a string containing the target name. * * This function renames a file/directory in debugfs. The target must not * exist for rename to succeed. * * This function will return a pointer to old_dentry (which is updated to * reflect renaming) if it succeeds. If an error occurs, ERR_PTR(-ERROR) * will be returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *new_dir, const char *new_name) { int error; struct dentry *dentry = NULL, *trap; struct name_snapshot old_name; if (IS_ERR(old_dir)) return old_dir; if (IS_ERR(new_dir)) return new_dir; if (IS_ERR_OR_NULL(old_dentry)) return old_dentry; trap = lock_rename(new_dir, old_dir); /* Source or destination directories don't exist? */ if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir)) goto exit; /* Source does not exist, cyclic rename, or mountpoint? */ if (d_really_is_negative(old_dentry) || old_dentry == trap || d_mountpoint(old_dentry)) goto exit; dentry = lookup_one_len(new_name, new_dir, strlen(new_name)); /* Lookup failed, cyclic rename or target exists? 
*/ if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry)) goto exit; take_dentry_name_snapshot(&old_name, old_dentry); error = simple_rename(&nop_mnt_idmap, d_inode(old_dir), old_dentry, d_inode(new_dir), dentry, 0); if (error) { release_dentry_name_snapshot(&old_name); goto exit; } d_move(old_dentry, dentry); fsnotify_move(d_inode(old_dir), d_inode(new_dir), &old_name.name, d_is_dir(old_dentry), NULL, old_dentry); release_dentry_name_snapshot(&old_name); unlock_rename(new_dir, old_dir); dput(dentry); return old_dentry; exit: if (dentry && !IS_ERR(dentry)) dput(dentry); unlock_rename(new_dir, old_dir); if (IS_ERR(dentry)) return dentry; return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(debugfs_rename); /** * debugfs_initialized - Tells whether debugfs has been registered */ bool debugfs_initialized(void) { return debugfs_registered; } EXPORT_SYMBOL_GPL(debugfs_initialized); static int __init debugfs_kernel(char *str) { if (str) { if (!strcmp(str, "on")) debugfs_allow = DEBUGFS_ALLOW_API | DEBUGFS_ALLOW_MOUNT; else if (!strcmp(str, "no-mount")) debugfs_allow = DEBUGFS_ALLOW_API; else if (!strcmp(str, "off")) debugfs_allow = 0; } return 0; } early_param("debugfs", debugfs_kernel); static int __init debugfs_init(void) { int retval; if (!(debugfs_allow & DEBUGFS_ALLOW_MOUNT)) return -EPERM; retval = sysfs_create_mount_point(kernel_kobj, "debug"); if (retval) return retval; retval = register_filesystem(&debug_fs_type); if (retval) sysfs_remove_mount_point(kernel_kobj, "debug"); else debugfs_registered = true; return retval; } core_initcall(debugfs_init);
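/*
 * Editor's illustrative sketch (not part of inode.c above): the usual calling
 * pattern for the creation helpers declared in this file.  The "example"
 * directory, the "status" file and the example_* identifiers are made up for
 * the sketch; DEFINE_SHOW_ATTRIBUTE() comes from <linux/seq_file.h>, not from
 * debugfs itself.  Errors are deliberately ignored, as the comments above
 * recommend, because the other debugfs calls tolerate error-valued dentries.
 */
#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/seq_file.h>

static struct dentry *example_dir;

static int example_status_show(struct seq_file *m, void *unused)
{
	seq_puts(m, "ok\n");
	return 0;
}
DEFINE_SHOW_ATTRIBUTE(example_status);

static int __init example_init(void)
{
	/* Creates <debugfs>/example/status, readable by the owner only. */
	example_dir = debugfs_create_dir("example", NULL);
	debugfs_create_file("status", 0400, example_dir, NULL,
			    &example_status_fops);
	return 0;
}

static void __exit example_exit(void)
{
	/* Recursively removes the directory and everything created in it. */
	debugfs_remove(example_dir);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");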
/* * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/slab.h> #include <linux/types.h> #include <linux/rbtree.h> #include <linux/bitops.h> #include <linux/export.h> #include "rds.h" /* * This file implements the receive side of the unconventional congestion * management in RDS. * * Messages waiting in the receive queue on the receiving socket are accounted * against the sockets SO_RCVBUF option value. Only the payload bytes in the * message are accounted for. 
If the number of bytes queued equals or exceeds * rcvbuf then the socket is congested. All sends attempted to this socket's * address should return block or return -EWOULDBLOCK. * * Applications are expected to be reasonably tuned such that this situation * very rarely occurs. An application encountering this "back-pressure" is * considered a bug. * * This is implemented by having each node maintain bitmaps which indicate * which ports on bound addresses are congested. As the bitmap changes it is * sent through all the connections which terminate in the local address of the * bitmap which changed. * * The bitmaps are allocated as connections are brought up. This avoids * allocation in the interrupt handling path which queues messages on sockets. * The dense bitmaps let transports send the entire bitmap on any bitmap change * reasonably efficiently. This is much easier to implement than some * finer-grained communication of per-port congestion. The sender does a very * inexpensive bit test to test if the port it's about to send to is congested * or not. */ /* * Interaction with poll is a tad tricky. We want all processes stuck in * poll to wake up and check whether a congested destination became uncongested. * The really sad thing is we have no idea which destinations the application * wants to send to - we don't even know which rds_connections are involved. * So until we implement a more flexible rds poll interface, we have to make * do with this: * We maintain a global counter that is incremented each time a congestion map * update is received. Each rds socket tracks this value, and if rds_poll * finds that the saved generation number is smaller than the global generation * number, it wakes up the process. */ static atomic_t rds_cong_generation = ATOMIC_INIT(0); /* * Congestion monitoring */ static LIST_HEAD(rds_cong_monitor); static DEFINE_RWLOCK(rds_cong_monitor_lock); /* * Yes, a global lock. It's used so infrequently that it's worth keeping it * global to simplify the locking. It's only used in the following * circumstances: * * - on connection buildup to associate a conn with its maps * - on map changes to inform conns of a new map to send * * It's sadly ordered under the socket callback lock and the connection lock. * Receive paths can mark ports congested from interrupt context so the * lock masks interrupts. */ static DEFINE_SPINLOCK(rds_cong_lock); static struct rb_root rds_cong_tree = RB_ROOT; static struct rds_cong_map *rds_cong_tree_walk(const struct in6_addr *addr, struct rds_cong_map *insert) { struct rb_node **p = &rds_cong_tree.rb_node; struct rb_node *parent = NULL; struct rds_cong_map *map; while (*p) { int diff; parent = *p; map = rb_entry(parent, struct rds_cong_map, m_rb_node); diff = rds_addr_cmp(addr, &map->m_addr); if (diff < 0) p = &(*p)->rb_left; else if (diff > 0) p = &(*p)->rb_right; else return map; } if (insert) { rb_link_node(&insert->m_rb_node, parent, p); rb_insert_color(&insert->m_rb_node, &rds_cong_tree); } return NULL; } /* * There is only ever one bitmap for any address. Connections try and allocate * these bitmaps in the process getting pointers to them. The bitmaps are only * ever freed as the module is removed after all connections have been freed. 
*/ static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr) { struct rds_cong_map *map; struct rds_cong_map *ret = NULL; unsigned long zp; unsigned long i; unsigned long flags; map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); if (!map) return NULL; map->m_addr = *addr; init_waitqueue_head(&map->m_waitq); INIT_LIST_HEAD(&map->m_conn_list); for (i = 0; i < RDS_CONG_MAP_PAGES; i++) { zp = get_zeroed_page(GFP_KERNEL); if (zp == 0) goto out; map->m_page_addrs[i] = zp; } spin_lock_irqsave(&rds_cong_lock, flags); ret = rds_cong_tree_walk(addr, map); spin_unlock_irqrestore(&rds_cong_lock, flags); if (!ret) { ret = map; map = NULL; } out: if (map) { for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) free_page(map->m_page_addrs[i]); kfree(map); } rdsdebug("map %p for addr %pI6c\n", ret, addr); return ret; } /* * Put the conn on its local map's list. This is called when the conn is * really added to the hash. It's nested under the rds_conn_lock, sadly. */ void rds_cong_add_conn(struct rds_connection *conn) { unsigned long flags; rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong); spin_lock_irqsave(&rds_cong_lock, flags); list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list); spin_unlock_irqrestore(&rds_cong_lock, flags); } void rds_cong_remove_conn(struct rds_connection *conn) { unsigned long flags; rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong); spin_lock_irqsave(&rds_cong_lock, flags); list_del_init(&conn->c_map_item); spin_unlock_irqrestore(&rds_cong_lock, flags); } int rds_cong_get_maps(struct rds_connection *conn) { conn->c_lcong = rds_cong_from_addr(&conn->c_laddr); conn->c_fcong = rds_cong_from_addr(&conn->c_faddr); if (!(conn->c_lcong && conn->c_fcong)) return -ENOMEM; return 0; } void rds_cong_queue_updates(struct rds_cong_map *map) { struct rds_connection *conn; unsigned long flags; spin_lock_irqsave(&rds_cong_lock, flags); list_for_each_entry(conn, &map->m_conn_list, c_map_item) { struct rds_conn_path *cp = &conn->c_path[0]; rcu_read_lock(); if (!test_and_set_bit(0, &conn->c_map_queued) && !rds_destroy_pending(cp->cp_conn)) { rds_stats_inc(s_cong_update_queued); /* We cannot inline the call to rds_send_xmit() here * for two reasons (both pertaining to a TCP transport): * 1. When we get here from the receive path, we * are already holding the sock_lock (held by * tcp_v4_rcv()). So inlining calls to * tcp_setsockopt and/or tcp_sendmsg will deadlock * when it tries to get the sock_lock()) * 2. Interrupts are masked so that we can mark the * port congested from both send and recv paths. * (See comment around declaration of rdc_cong_lock). * An attempt to get the sock_lock() here will * therefore trigger warnings. * Defer the xmit to rds_send_worker() instead. 
*/ queue_delayed_work(rds_wq, &cp->cp_send_w, 0); } rcu_read_unlock(); } spin_unlock_irqrestore(&rds_cong_lock, flags); } void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask) { rdsdebug("waking map %p for %pI4\n", map, &map->m_addr); rds_stats_inc(s_cong_update_received); atomic_inc(&rds_cong_generation); if (waitqueue_active(&map->m_waitq)) wake_up(&map->m_waitq); if (waitqueue_active(&rds_poll_waitq)) wake_up_all(&rds_poll_waitq); if (portmask && !list_empty(&rds_cong_monitor)) { unsigned long flags; struct rds_sock *rs; read_lock_irqsave(&rds_cong_monitor_lock, flags); list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) { spin_lock(&rs->rs_lock); rs->rs_cong_notify |= (rs->rs_cong_mask & portmask); rs->rs_cong_mask &= ~portmask; spin_unlock(&rs->rs_lock); if (rs->rs_cong_notify) rds_wake_sk_sleep(rs); } read_unlock_irqrestore(&rds_cong_monitor_lock, flags); } } EXPORT_SYMBOL_GPL(rds_cong_map_updated); int rds_cong_updated_since(unsigned long *recent) { unsigned long gen = atomic_read(&rds_cong_generation); if (likely(*recent == gen)) return 0; *recent = gen; return 1; } /* * We're called under the locking that protects the sockets receive buffer * consumption. This makes it a lot easier for the caller to only call us * when it knows that an existing set bit needs to be cleared, and vice versa. * We can't block and we need to deal with concurrent sockets working against * the same per-address map. */ void rds_cong_set_bit(struct rds_cong_map *map, __be16 port) { unsigned long i; unsigned long off; rdsdebug("setting congestion for %pI4:%u in map %p\n", &map->m_addr, ntohs(port), map); i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; set_bit_le(off, (void *)map->m_page_addrs[i]); } void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) { unsigned long i; unsigned long off; rdsdebug("clearing congestion for %pI4:%u in map %p\n", &map->m_addr, ntohs(port), map); i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; clear_bit_le(off, (void *)map->m_page_addrs[i]); } static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) { unsigned long i; unsigned long off; i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; return test_bit_le(off, (void *)map->m_page_addrs[i]); } void rds_cong_add_socket(struct rds_sock *rs) { unsigned long flags; write_lock_irqsave(&rds_cong_monitor_lock, flags); if (list_empty(&rs->rs_cong_list)) list_add(&rs->rs_cong_list, &rds_cong_monitor); write_unlock_irqrestore(&rds_cong_monitor_lock, flags); } void rds_cong_remove_socket(struct rds_sock *rs) { unsigned long flags; struct rds_cong_map *map; write_lock_irqsave(&rds_cong_monitor_lock, flags); list_del_init(&rs->rs_cong_list); write_unlock_irqrestore(&rds_cong_monitor_lock, flags); /* update congestion map for now-closed port */ spin_lock_irqsave(&rds_cong_lock, flags); map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL); spin_unlock_irqrestore(&rds_cong_lock, flags); if (map && rds_cong_test_bit(map, rs->rs_bound_port)) { rds_cong_clear_bit(map, rs->rs_bound_port); rds_cong_queue_updates(map); } } int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs) { if (!rds_cong_test_bit(map, port)) return 0; if (nonblock) { if (rs && rs->rs_cong_monitor) { unsigned long flags; /* It would have been nice to have an atomic set_bit on * a uint64_t. 
*/ spin_lock_irqsave(&rs->rs_lock, flags); rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port)); spin_unlock_irqrestore(&rs->rs_lock, flags); /* Test again - a congestion update may have arrived in * the meantime. */ if (!rds_cong_test_bit(map, port)) return 0; } rds_stats_inc(s_cong_send_error); return -ENOBUFS; } rds_stats_inc(s_cong_send_blocked); rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port)); return wait_event_interruptible(map->m_waitq, !rds_cong_test_bit(map, port)); } void rds_cong_exit(void) { struct rb_node *node; struct rds_cong_map *map; unsigned long i; while ((node = rb_first(&rds_cong_tree))) { map = rb_entry(node, struct rds_cong_map, m_rb_node); rdsdebug("freeing map %p\n", map); rb_erase(&map->m_rb_node, &rds_cong_tree); for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) free_page(map->m_page_addrs[i]); kfree(map); } } /* * Allocate a RDS message containing a congestion update. */ struct rds_message *rds_cong_update_alloc(struct rds_connection *conn) { struct rds_cong_map *map = conn->c_lcong; struct rds_message *rm; rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES); if (!IS_ERR(rm)) rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP; return rm; }
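/*
 * Editor's illustrative sketch (not part of the RDS congestion code above):
 * the port-to-bit arithmetic used by rds_cong_set_bit(), rds_cong_clear_bit()
 * and rds_cong_test_bit().  The page size below is an assumption made for the
 * example (4 KiB pages); in the driver the real constants
 * RDS_CONG_MAP_PAGE_BITS and RDS_CONG_MAP_PAGES come from rds.h.
 */
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_PAGE_BITS	(4096 * 8)	/* bits carried per page */

int main(void)
{
	uint16_t port = 5000;			/* host byte order for the demo */
	unsigned long page = port / EXAMPLE_PAGE_BITS;
	unsigned long bit  = port % EXAMPLE_PAGE_BITS;

	/* One bit per possible 16-bit port, spread across whole pages. */
	printf("port %u -> page %lu, bit %lu\n", (unsigned int)port, page, bit);
	return 0;
}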
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NETFILTER_H #define __LINUX_NETFILTER_H #include <linux/init.h> #include <linux/skbuff.h> #include <linux/net.h> #include <linux/if.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/wait.h> #include <linux/list.h> #include <linux/static_key.h> #include <linux/module.h> #include <linux/netfilter_defs.h> #include <linux/netdevice.h> #include <linux/sockptr.h> #include <net/net_namespace.h> static inline int NF_DROP_GETERR(int verdict) { return -(verdict >> NF_VERDICT_QBITS); } static __always_inline int NF_DROP_REASON(struct sk_buff *skb, enum skb_drop_reason reason, u32 err) { BUILD_BUG_ON(err > 0xffff); kfree_skb_reason(skb, reason); return ((err << 16) | NF_STOLEN); } static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ul2 = (const unsigned long *)a2; return ((ul1[0] ^ ul2[0]) | (ul1[1] ^ ul2[1])) == 0UL; #else return a1->all[0] == a2->all[0] && a1->all[1] == a2->all[1] && a1->all[2] == a2->all[2] && a1->all[3] == a2->all[3]; #endif } static inline void nf_inet_addr_mask(const union nf_inet_addr *a1, union nf_inet_addr *result, const union nf_inet_addr *mask) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ua = (const unsigned long *)a1; unsigned long *ur = 
(unsigned long *)result; const unsigned long *um = (const unsigned long *)mask; ur[0] = ua[0] & um[0]; ur[1] = ua[1] & um[1]; #else result->all[0] = a1->all[0] & mask->all[0]; result->all[1] = a1->all[1] & mask->all[1]; result->all[2] = a1->all[2] & mask->all[2]; result->all[3] = a1->all[3] & mask->all[3]; #endif } int netfilter_init(void); struct sk_buff; struct nf_hook_ops; struct sock; struct nf_hook_state { u8 hook; u8 pf; struct net_device *in; struct net_device *out; struct sock *sk; struct net *net; int (*okfn)(struct net *, struct sock *, struct sk_buff *); }; typedef unsigned int nf_hookfn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); enum nf_hook_ops_type { NF_HOOK_OP_UNDEFINED, NF_HOOK_OP_NF_TABLES, NF_HOOK_OP_BPF, }; struct nf_hook_ops { /* User fills in from here down. */ nf_hookfn *hook; struct net_device *dev; void *priv; u8 pf; enum nf_hook_ops_type hook_ops_type:8; unsigned int hooknum; /* Hooks are ordered in ascending priority. */ int priority; }; struct nf_hook_entry { nf_hookfn *hook; void *priv; }; struct nf_hook_entries_rcu_head { struct rcu_head head; void *allocation; }; struct nf_hook_entries { u16 num_hook_entries; /* padding */ struct nf_hook_entry hooks[]; /* trailer: pointers to original orig_ops of each hook, * followed by rcu_head and scratch space used for freeing * the structure via call_rcu. * * This is not part of struct nf_hook_entry since its only * needed in slow path (hook register/unregister): * const struct nf_hook_ops *orig_ops[] * * For the same reason, we store this at end -- its * only needed when a hook is deleted, not during * packet path processing: * struct nf_hook_entries_rcu_head head */ }; #ifdef CONFIG_NETFILTER static inline struct nf_hook_ops **nf_hook_entries_get_hook_ops(const struct nf_hook_entries *e) { unsigned int n = e->num_hook_entries; const void *hook_end; hook_end = &e->hooks[n]; /* this is *past* ->hooks[]! */ return (struct nf_hook_ops **)hook_end; } static inline int nf_hook_entry_hookfn(const struct nf_hook_entry *entry, struct sk_buff *skb, struct nf_hook_state *state) { return entry->hook(entry->priv, skb, state); } static inline void nf_hook_state_init(struct nf_hook_state *p, unsigned int hook, u_int8_t pf, struct net_device *indev, struct net_device *outdev, struct sock *sk, struct net *net, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { p->hook = hook; p->pf = pf; p->in = indev; p->out = outdev; p->sk = sk; p->net = net; p->okfn = okfn; } struct nf_sockopt_ops { struct list_head list; u_int8_t pf; /* Non-inclusive ranges: use 0/0/NULL to never get called. */ int set_optmin; int set_optmax; int (*set)(struct sock *sk, int optval, sockptr_t arg, unsigned int len); int get_optmin; int get_optmax; int (*get)(struct sock *sk, int optval, void __user *user, int *len); /* Use the module struct to lock set/get code in place */ struct module *owner; }; /* Function to register/unregister hook points. */ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops); void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops); int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n); void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n); /* Functions to register get/setsockopt ranges (non-inclusive). You need to check permissions yourself! 
*/ int nf_register_sockopt(struct nf_sockopt_ops *reg); void nf_unregister_sockopt(struct nf_sockopt_ops *reg); #ifdef CONFIG_JUMP_LABEL extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #endif int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, const struct nf_hook_entries *e, unsigned int i); void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, const struct nf_hook_entries *e); /** * nf_hook - call a netfilter hook * * Returns 1 if the hook has allowed the packet to pass. The function * okfn must be invoked by the caller in this case. Any other return * value indicates the packet has been consumed by the hook. */ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { struct nf_hook_entries *hook_head = NULL; int ret = 1; #ifdef CONFIG_JUMP_LABEL if (__builtin_constant_p(pf) && __builtin_constant_p(hook) && !static_key_false(&nf_hooks_needed[pf][hook])) return 1; #endif rcu_read_lock(); switch (pf) { case NFPROTO_IPV4: hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); break; case NFPROTO_IPV6: hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); break; case NFPROTO_ARP: #ifdef CONFIG_NETFILTER_FAMILY_ARP if (WARN_ON_ONCE(hook >= ARRAY_SIZE(net->nf.hooks_arp))) break; hook_head = rcu_dereference(net->nf.hooks_arp[hook]); #endif break; case NFPROTO_BRIDGE: #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); #endif break; default: WARN_ON_ONCE(1); break; } if (hook_head) { struct nf_hook_state state; nf_hook_state_init(&state, hook, pf, indev, outdev, sk, net, okfn); ret = nf_hook_slow(skb, &state, hook_head, 0); } rcu_read_unlock(); return ret; } /* Activate hook; either okfn or kfree_skb called, unless a hook returns NF_STOLEN (in which case, it's up to the hook to deal with the consequences). Returns -ERRNO if packet dropped. Zero means queued, stolen or accepted. */ /* RR: > I don't want nf_hook to return anything because people might forget > about async and trust the return value to mean "packet was ok". 
AK: Just document it clearly, then you can expect some sense from kernel coders :) */ static inline int NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *), bool cond) { int ret; if (!cond || ((ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn)) == 1)) ret = okfn(net, sk, skb); return ret; } static inline int NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn); if (ret == 1) ret = okfn(net, sk, skb); return ret; } static inline void NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct list_head *head, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { struct nf_hook_entries *hook_head = NULL; #ifdef CONFIG_JUMP_LABEL if (__builtin_constant_p(pf) && __builtin_constant_p(hook) && !static_key_false(&nf_hooks_needed[pf][hook])) return; #endif rcu_read_lock(); switch (pf) { case NFPROTO_IPV4: hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); break; case NFPROTO_IPV6: hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); break; default: WARN_ON_ONCE(1); break; } if (hook_head) { struct nf_hook_state state; nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn); nf_hook_slow_list(head, &state, hook_head); } rcu_read_unlock(); } /* Call setsockopt() */ int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, sockptr_t opt, unsigned int len); int nf_getsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, int *len); struct flowi; struct nf_queue_entry; __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol, unsigned short family); __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, u_int8_t protocol, unsigned short family); int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict, unsigned short family); int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry); #include <net/flow.h> struct nf_conn; enum nf_nat_manip_type; struct nlattr; enum ip_conntrack_dir; struct nf_nat_hook { int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr); void (*decode_session)(struct sk_buff *skb, struct flowi *fl); unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct, enum nf_nat_manip_type mtype, enum ip_conntrack_dir dir); void (*remove_nat_bysrc)(struct nf_conn *ct); }; extern const struct nf_nat_hook __rcu *nf_nat_hook; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { #if IS_ENABLED(CONFIG_NF_NAT) const struct nf_nat_hook *nat_hook; rcu_read_lock(); nat_hook = rcu_dereference(nf_nat_hook); if (nat_hook && nat_hook->decode_session) nat_hook->decode_session(skb, fl); rcu_read_unlock(); #endif } #else /* !CONFIG_NETFILTER */ static inline int NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *), bool cond) { return okfn(net, sk, skb); } static inline int NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct 
net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { return okfn(net, sk, skb); } static inline void NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct list_head *head, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { /* nothing to do */ } static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { return 1; } struct flowi; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { } #endif /*CONFIG_NETFILTER*/ #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <linux/netfilter/nf_conntrack_zones_common.h> void nf_ct_attach(struct sk_buff *, const struct sk_buff *); void nf_ct_set_closing(struct nf_conntrack *nfct); struct nf_conntrack_tuple; bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb); #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} static inline void nf_ct_set_closing(struct nf_conntrack *nfct) {} struct nf_conntrack_tuple; static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { return false; } #endif struct nf_conn; enum ip_conntrack_info; struct nf_ct_hook { int (*update)(struct net *net, struct sk_buff *skb); void (*destroy)(struct nf_conntrack *); bool (*get_tuple_skb)(struct nf_conntrack_tuple *, const struct sk_buff *); void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); void (*set_closing)(struct nf_conntrack *nfct); }; extern const struct nf_ct_hook __rcu *nf_ct_hook; struct nlattr; struct nfnl_ct_hook { size_t (*build_size)(const struct nf_conn *ct); int (*build)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, u_int16_t ct_attr, u_int16_t ct_info_attr); int (*parse)(const struct nlattr *attr, struct nf_conn *ct); int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct, u32 portid, u32 report); void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off); }; extern const struct nfnl_ct_hook __rcu *nfnl_ct_hook; struct nf_defrag_hook { struct module *owner; int (*enable)(struct net *net); void (*disable)(struct net *net); }; extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook; extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook; /* * nf_skb_duplicated - TEE target has sent a packet * * When a xtables target sends a packet, the OUTPUT and POSTROUTING * hooks are traversed again, i.e. nft and xtables are invoked recursively. * * This is used by xtables TEE target to prevent the duplicated skb from * being duplicated again. */ DECLARE_PER_CPU(bool, nf_skb_duplicated); /* * Contains bitmask of ctnetlink event subscribers, if any. * Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag. */ extern u8 nf_ctnetlink_has_listener; #endif /*__LINUX_NETFILTER_H*/
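/*
 * Editor's illustrative sketch (not part of netfilter.h above): registering a
 * hook through the nf_hook_ops / nf_register_net_hook() interface declared in
 * this header.  example_hookfn, example_ops and the module boilerplate are
 * names invented for the sketch, and the chosen hook point and priority are
 * arbitrary example values; a real user would pick what suits its needs.
 */
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/skbuff.h>

static unsigned int example_hookfn(void *priv, struct sk_buff *skb,
				   const struct nf_hook_state *state)
{
	/* Inspect or count the packet here; always let it continue. */
	return NF_ACCEPT;
}

static const struct nf_hook_ops example_ops = {
	.hook		= example_hookfn,
	.pf		= NFPROTO_IPV4,
	.hooknum	= NF_INET_PRE_ROUTING,
	.priority	= NF_IP_PRI_FILTER,
};

static int __init example_nf_init(void)
{
	/* Attach to the initial network namespace only. */
	return nf_register_net_hook(&init_net, &example_ops);
}

static void __exit example_nf_exit(void)
{
	nf_unregister_net_hook(&init_net, &example_ops);
}

module_init(example_nf_init);
module_exit(example_nf_exit);
MODULE_LICENSE("GPL");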
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
*/ #include <linux/string.h> #include <linux/slab.h> #include <linux/buffer_head.h> #include <asm/unaligned.h> #include "exfat_raw.h" #include "exfat_fs.h" /* Upcase table macro */ #define EXFAT_NUM_UPCASE (2918) #define UTBL_COUNT (0x10000) /* * Upcase table in compressed format (7.2.5.1 Recommended Up-case Table * in exfat specification, See: * https://docs.microsoft.com/en-us/windows/win32/fileio/exfat-specification). */ static const unsigned short uni_def_upcase[EXFAT_NUM_UPCASE] = { 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 0x0060, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00f7, 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x0178, 0x0100, 0x0100, 0x0102, 0x0102, 0x0104, 0x0104, 0x0106, 0x0106, 0x0108, 0x0108, 0x010a, 0x010a, 0x010c, 0x010c, 0x010e, 0x010e, 0x0110, 0x0110, 0x0112, 0x0112, 0x0114, 0x0114, 0x0116, 0x0116, 0x0118, 0x0118, 0x011a, 0x011a, 0x011c, 0x011c, 0x011e, 0x011e, 0x0120, 0x0120, 0x0122, 0x0122, 0x0124, 0x0124, 0x0126, 0x0126, 0x0128, 0x0128, 0x012a, 0x012a, 0x012c, 0x012c, 0x012e, 0x012e, 0x0130, 0x0131, 0x0132, 0x0132, 0x0134, 0x0134, 0x0136, 0x0136, 0x0138, 0x0139, 0x0139, 0x013b, 0x013b, 0x013d, 0x013d, 0x013f, 0x013f, 0x0141, 0x0141, 0x0143, 0x0143, 0x0145, 0x0145, 0x0147, 0x0147, 0x0149, 0x014a, 0x014a, 0x014c, 0x014c, 0x014e, 0x014e, 0x0150, 0x0150, 0x0152, 0x0152, 0x0154, 0x0154, 0x0156, 0x0156, 0x0158, 0x0158, 0x015a, 0x015a, 0x015c, 0x015c, 0x015e, 0x015e, 0x0160, 0x0160, 0x0162, 0x0162, 0x0164, 0x0164, 0x0166, 0x0166, 0x0168, 0x0168, 0x016a, 0x016a, 0x016c, 0x016c, 0x016e, 0x016e, 0x0170, 0x0170, 0x0172, 0x0172, 0x0174, 0x0174, 0x0176, 0x0176, 0x0178, 0x0179, 0x0179, 0x017b, 0x017b, 0x017d, 
0x017d, 0x017f, 0x0243, 0x0181, 0x0182, 0x0182, 0x0184, 0x0184, 0x0186, 0x0187, 0x0187, 0x0189, 0x018a, 0x018b, 0x018b, 0x018d, 0x018e, 0x018f, 0x0190, 0x0191, 0x0191, 0x0193, 0x0194, 0x01f6, 0x0196, 0x0197, 0x0198, 0x0198, 0x023d, 0x019b, 0x019c, 0x019d, 0x0220, 0x019f, 0x01a0, 0x01a0, 0x01a2, 0x01a2, 0x01a4, 0x01a4, 0x01a6, 0x01a7, 0x01a7, 0x01a9, 0x01aa, 0x01ab, 0x01ac, 0x01ac, 0x01ae, 0x01af, 0x01af, 0x01b1, 0x01b2, 0x01b3, 0x01b3, 0x01b5, 0x01b5, 0x01b7, 0x01b8, 0x01b8, 0x01ba, 0x01bb, 0x01bc, 0x01bc, 0x01be, 0x01f7, 0x01c0, 0x01c1, 0x01c2, 0x01c3, 0x01c4, 0x01c5, 0x01c4, 0x01c7, 0x01c8, 0x01c7, 0x01ca, 0x01cb, 0x01ca, 0x01cd, 0x01cd, 0x01cf, 0x01cf, 0x01d1, 0x01d1, 0x01d3, 0x01d3, 0x01d5, 0x01d5, 0x01d7, 0x01d7, 0x01d9, 0x01d9, 0x01db, 0x01db, 0x018e, 0x01de, 0x01de, 0x01e0, 0x01e0, 0x01e2, 0x01e2, 0x01e4, 0x01e4, 0x01e6, 0x01e6, 0x01e8, 0x01e8, 0x01ea, 0x01ea, 0x01ec, 0x01ec, 0x01ee, 0x01ee, 0x01f0, 0x01f1, 0x01f2, 0x01f1, 0x01f4, 0x01f4, 0x01f6, 0x01f7, 0x01f8, 0x01f8, 0x01fa, 0x01fa, 0x01fc, 0x01fc, 0x01fe, 0x01fe, 0x0200, 0x0200, 0x0202, 0x0202, 0x0204, 0x0204, 0x0206, 0x0206, 0x0208, 0x0208, 0x020a, 0x020a, 0x020c, 0x020c, 0x020e, 0x020e, 0x0210, 0x0210, 0x0212, 0x0212, 0x0214, 0x0214, 0x0216, 0x0216, 0x0218, 0x0218, 0x021a, 0x021a, 0x021c, 0x021c, 0x021e, 0x021e, 0x0220, 0x0221, 0x0222, 0x0222, 0x0224, 0x0224, 0x0226, 0x0226, 0x0228, 0x0228, 0x022a, 0x022a, 0x022c, 0x022c, 0x022e, 0x022e, 0x0230, 0x0230, 0x0232, 0x0232, 0x0234, 0x0235, 0x0236, 0x0237, 0x0238, 0x0239, 0x2c65, 0x023b, 0x023b, 0x023d, 0x2c66, 0x023f, 0x0240, 0x0241, 0x0241, 0x0243, 0x0244, 0x0245, 0x0246, 0x0246, 0x0248, 0x0248, 0x024a, 0x024a, 0x024c, 0x024c, 0x024e, 0x024e, 0x0250, 0x0251, 0x0252, 0x0181, 0x0186, 0x0255, 0x0189, 0x018a, 0x0258, 0x018f, 0x025a, 0x0190, 0x025c, 0x025d, 0x025e, 0x025f, 0x0193, 0x0261, 0x0262, 0x0194, 0x0264, 0x0265, 0x0266, 0x0267, 0x0197, 0x0196, 0x026a, 0x2c62, 0x026c, 0x026d, 0x026e, 0x019c, 0x0270, 0x0271, 0x019d, 0x0273, 0x0274, 0x019f, 0x0276, 0x0277, 0x0278, 0x0279, 0x027a, 0x027b, 0x027c, 0x2c64, 0x027e, 0x027f, 0x01a6, 0x0281, 0x0282, 0x01a9, 0x0284, 0x0285, 0x0286, 0x0287, 0x01ae, 0x0244, 0x01b1, 0x01b2, 0x0245, 0x028d, 0x028e, 0x028f, 0x0290, 0x0291, 0x01b7, 0x0293, 0x0294, 0x0295, 0x0296, 0x0297, 0x0298, 0x0299, 0x029a, 0x029b, 0x029c, 0x029d, 0x029e, 0x029f, 0x02a0, 0x02a1, 0x02a2, 0x02a3, 0x02a4, 0x02a5, 0x02a6, 0x02a7, 0x02a8, 0x02a9, 0x02aa, 0x02ab, 0x02ac, 0x02ad, 0x02ae, 0x02af, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x02b5, 0x02b6, 0x02b7, 0x02b8, 0x02b9, 0x02ba, 0x02bb, 0x02bc, 0x02bd, 0x02be, 0x02bf, 0x02c0, 0x02c1, 0x02c2, 0x02c3, 0x02c4, 0x02c5, 0x02c6, 0x02c7, 0x02c8, 0x02c9, 0x02ca, 0x02cb, 0x02cc, 0x02cd, 0x02ce, 0x02cf, 0x02d0, 0x02d1, 0x02d2, 0x02d3, 0x02d4, 0x02d5, 0x02d6, 0x02d7, 0x02d8, 0x02d9, 0x02da, 0x02db, 0x02dc, 0x02dd, 0x02de, 0x02df, 0x02e0, 0x02e1, 0x02e2, 0x02e3, 0x02e4, 0x02e5, 0x02e6, 0x02e7, 0x02e8, 0x02e9, 0x02ea, 0x02eb, 0x02ec, 0x02ed, 0x02ee, 0x02ef, 0x02f0, 0x02f1, 0x02f2, 0x02f3, 0x02f4, 0x02f5, 0x02f6, 0x02f7, 0x02f8, 0x02f9, 0x02fa, 0x02fb, 0x02fc, 0x02fd, 0x02fe, 0x02ff, 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 0x0307, 0x0308, 0x0309, 0x030a, 0x030b, 0x030c, 0x030d, 0x030e, 0x030f, 0x0310, 0x0311, 0x0312, 0x0313, 0x0314, 0x0315, 0x0316, 0x0317, 0x0318, 0x0319, 0x031a, 0x031b, 0x031c, 0x031d, 0x031e, 0x031f, 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, 0x0327, 0x0328, 0x0329, 0x032a, 0x032b, 0x032c, 0x032d, 0x032e, 0x032f, 0x0330, 0x0331, 0x0332, 0x0333, 0x0334, 0x0335, 0x0336, 0x0337, 0x0338, 0x0339, 
0x033a, 0x033b, 0x033c, 0x033d, 0x033e, 0x033f, 0x0340, 0x0341, 0x0342, 0x0343, 0x0344, 0x0345, 0x0346, 0x0347, 0x0348, 0x0349, 0x034a, 0x034b, 0x034c, 0x034d, 0x034e, 0x034f, 0x0350, 0x0351, 0x0352, 0x0353, 0x0354, 0x0355, 0x0356, 0x0357, 0x0358, 0x0359, 0x035a, 0x035b, 0x035c, 0x035d, 0x035e, 0x035f, 0x0360, 0x0361, 0x0362, 0x0363, 0x0364, 0x0365, 0x0366, 0x0367, 0x0368, 0x0369, 0x036a, 0x036b, 0x036c, 0x036d, 0x036e, 0x036f, 0x0370, 0x0371, 0x0372, 0x0373, 0x0374, 0x0375, 0x0376, 0x0377, 0x0378, 0x0379, 0x037a, 0x03fd, 0x03fe, 0x03ff, 0x037e, 0x037f, 0x0380, 0x0381, 0x0382, 0x0383, 0x0384, 0x0385, 0x0386, 0x0387, 0x0388, 0x0389, 0x038a, 0x038b, 0x038c, 0x038d, 0x038e, 0x038f, 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, 0x03a0, 0x03a1, 0x03a2, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x0386, 0x0388, 0x0389, 0x038a, 0x03b0, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, 0x03a0, 0x03a1, 0x03a3, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x038c, 0x038e, 0x038f, 0x03cf, 0x03d0, 0x03d1, 0x03d2, 0x03d3, 0x03d4, 0x03d5, 0x03d6, 0x03d7, 0x03d8, 0x03d8, 0x03da, 0x03da, 0x03dc, 0x03dc, 0x03de, 0x03de, 0x03e0, 0x03e0, 0x03e2, 0x03e2, 0x03e4, 0x03e4, 0x03e6, 0x03e6, 0x03e8, 0x03e8, 0x03ea, 0x03ea, 0x03ec, 0x03ec, 0x03ee, 0x03ee, 0x03f0, 0x03f1, 0x03f9, 0x03f3, 0x03f4, 0x03f5, 0x03f6, 0x03f7, 0x03f7, 0x03f9, 0x03fa, 0x03fa, 0x03fc, 0x03fd, 0x03fe, 0x03ff, 0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f, 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, 0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f, 0x0460, 0x0460, 0x0462, 0x0462, 0x0464, 0x0464, 0x0466, 0x0466, 0x0468, 0x0468, 0x046a, 0x046a, 0x046c, 0x046c, 0x046e, 0x046e, 0x0470, 0x0470, 0x0472, 0x0472, 0x0474, 0x0474, 0x0476, 0x0476, 0x0478, 0x0478, 0x047a, 0x047a, 0x047c, 0x047c, 0x047e, 0x047e, 0x0480, 0x0480, 0x0482, 0x0483, 0x0484, 0x0485, 0x0486, 0x0487, 0x0488, 0x0489, 0x048a, 0x048a, 0x048c, 0x048c, 0x048e, 0x048e, 0x0490, 0x0490, 0x0492, 0x0492, 0x0494, 0x0494, 0x0496, 0x0496, 0x0498, 0x0498, 0x049a, 0x049a, 0x049c, 0x049c, 0x049e, 0x049e, 0x04a0, 0x04a0, 0x04a2, 0x04a2, 0x04a4, 0x04a4, 0x04a6, 0x04a6, 0x04a8, 0x04a8, 0x04aa, 0x04aa, 0x04ac, 0x04ac, 0x04ae, 0x04ae, 0x04b0, 0x04b0, 0x04b2, 0x04b2, 0x04b4, 0x04b4, 0x04b6, 0x04b6, 0x04b8, 0x04b8, 0x04ba, 0x04ba, 0x04bc, 0x04bc, 0x04be, 0x04be, 0x04c0, 0x04c1, 0x04c1, 0x04c3, 0x04c3, 0x04c5, 0x04c5, 0x04c7, 0x04c7, 0x04c9, 0x04c9, 0x04cb, 0x04cb, 0x04cd, 0x04cd, 0x04c0, 0x04d0, 0x04d0, 0x04d2, 0x04d2, 0x04d4, 0x04d4, 0x04d6, 0x04d6, 0x04d8, 0x04d8, 0x04da, 0x04da, 0x04dc, 0x04dc, 0x04de, 0x04de, 0x04e0, 0x04e0, 0x04e2, 0x04e2, 0x04e4, 0x04e4, 0x04e6, 0x04e6, 0x04e8, 0x04e8, 0x04ea, 0x04ea, 0x04ec, 0x04ec, 0x04ee, 0x04ee, 0x04f0, 0x04f0, 0x04f2, 0x04f2, 0x04f4, 0x04f4, 
0x04f6, 0x04f6, 0x04f8, 0x04f8, 0x04fa, 0x04fa, 0x04fc, 0x04fc, 0x04fe, 0x04fe, 0x0500, 0x0500, 0x0502, 0x0502, 0x0504, 0x0504, 0x0506, 0x0506, 0x0508, 0x0508, 0x050a, 0x050a, 0x050c, 0x050c, 0x050e, 0x050e, 0x0510, 0x0510, 0x0512, 0x0512, 0x0514, 0x0515, 0x0516, 0x0517, 0x0518, 0x0519, 0x051a, 0x051b, 0x051c, 0x051d, 0x051e, 0x051f, 0x0520, 0x0521, 0x0522, 0x0523, 0x0524, 0x0525, 0x0526, 0x0527, 0x0528, 0x0529, 0x052a, 0x052b, 0x052c, 0x052d, 0x052e, 0x052f, 0x0530, 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537, 0x0538, 0x0539, 0x053a, 0x053b, 0x053c, 0x053d, 0x053e, 0x053f, 0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547, 0x0548, 0x0549, 0x054a, 0x054b, 0x054c, 0x054d, 0x054e, 0x054f, 0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0x0557, 0x0558, 0x0559, 0x055a, 0x055b, 0x055c, 0x055d, 0x055e, 0x055f, 0x0560, 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537, 0x0538, 0x0539, 0x053a, 0x053b, 0x053c, 0x053d, 0x053e, 0x053f, 0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547, 0x0548, 0x0549, 0x054a, 0x054b, 0x054c, 0x054d, 0x054e, 0x054f, 0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0xffff, 0x17f6, 0x2c63, 0x1d7e, 0x1d7f, 0x1d80, 0x1d81, 0x1d82, 0x1d83, 0x1d84, 0x1d85, 0x1d86, 0x1d87, 0x1d88, 0x1d89, 0x1d8a, 0x1d8b, 0x1d8c, 0x1d8d, 0x1d8e, 0x1d8f, 0x1d90, 0x1d91, 0x1d92, 0x1d93, 0x1d94, 0x1d95, 0x1d96, 0x1d97, 0x1d98, 0x1d99, 0x1d9a, 0x1d9b, 0x1d9c, 0x1d9d, 0x1d9e, 0x1d9f, 0x1da0, 0x1da1, 0x1da2, 0x1da3, 0x1da4, 0x1da5, 0x1da6, 0x1da7, 0x1da8, 0x1da9, 0x1daa, 0x1dab, 0x1dac, 0x1dad, 0x1dae, 0x1daf, 0x1db0, 0x1db1, 0x1db2, 0x1db3, 0x1db4, 0x1db5, 0x1db6, 0x1db7, 0x1db8, 0x1db9, 0x1dba, 0x1dbb, 0x1dbc, 0x1dbd, 0x1dbe, 0x1dbf, 0x1dc0, 0x1dc1, 0x1dc2, 0x1dc3, 0x1dc4, 0x1dc5, 0x1dc6, 0x1dc7, 0x1dc8, 0x1dc9, 0x1dca, 0x1dcb, 0x1dcc, 0x1dcd, 0x1dce, 0x1dcf, 0x1dd0, 0x1dd1, 0x1dd2, 0x1dd3, 0x1dd4, 0x1dd5, 0x1dd6, 0x1dd7, 0x1dd8, 0x1dd9, 0x1dda, 0x1ddb, 0x1ddc, 0x1ddd, 0x1dde, 0x1ddf, 0x1de0, 0x1de1, 0x1de2, 0x1de3, 0x1de4, 0x1de5, 0x1de6, 0x1de7, 0x1de8, 0x1de9, 0x1dea, 0x1deb, 0x1dec, 0x1ded, 0x1dee, 0x1def, 0x1df0, 0x1df1, 0x1df2, 0x1df3, 0x1df4, 0x1df5, 0x1df6, 0x1df7, 0x1df8, 0x1df9, 0x1dfa, 0x1dfb, 0x1dfc, 0x1dfd, 0x1dfe, 0x1dff, 0x1e00, 0x1e00, 0x1e02, 0x1e02, 0x1e04, 0x1e04, 0x1e06, 0x1e06, 0x1e08, 0x1e08, 0x1e0a, 0x1e0a, 0x1e0c, 0x1e0c, 0x1e0e, 0x1e0e, 0x1e10, 0x1e10, 0x1e12, 0x1e12, 0x1e14, 0x1e14, 0x1e16, 0x1e16, 0x1e18, 0x1e18, 0x1e1a, 0x1e1a, 0x1e1c, 0x1e1c, 0x1e1e, 0x1e1e, 0x1e20, 0x1e20, 0x1e22, 0x1e22, 0x1e24, 0x1e24, 0x1e26, 0x1e26, 0x1e28, 0x1e28, 0x1e2a, 0x1e2a, 0x1e2c, 0x1e2c, 0x1e2e, 0x1e2e, 0x1e30, 0x1e30, 0x1e32, 0x1e32, 0x1e34, 0x1e34, 0x1e36, 0x1e36, 0x1e38, 0x1e38, 0x1e3a, 0x1e3a, 0x1e3c, 0x1e3c, 0x1e3e, 0x1e3e, 0x1e40, 0x1e40, 0x1e42, 0x1e42, 0x1e44, 0x1e44, 0x1e46, 0x1e46, 0x1e48, 0x1e48, 0x1e4a, 0x1e4a, 0x1e4c, 0x1e4c, 0x1e4e, 0x1e4e, 0x1e50, 0x1e50, 0x1e52, 0x1e52, 0x1e54, 0x1e54, 0x1e56, 0x1e56, 0x1e58, 0x1e58, 0x1e5a, 0x1e5a, 0x1e5c, 0x1e5c, 0x1e5e, 0x1e5e, 0x1e60, 0x1e60, 0x1e62, 0x1e62, 0x1e64, 0x1e64, 0x1e66, 0x1e66, 0x1e68, 0x1e68, 0x1e6a, 0x1e6a, 0x1e6c, 0x1e6c, 0x1e6e, 0x1e6e, 0x1e70, 0x1e70, 0x1e72, 0x1e72, 0x1e74, 0x1e74, 0x1e76, 0x1e76, 0x1e78, 0x1e78, 0x1e7a, 0x1e7a, 0x1e7c, 0x1e7c, 0x1e7e, 0x1e7e, 0x1e80, 0x1e80, 0x1e82, 0x1e82, 0x1e84, 0x1e84, 0x1e86, 0x1e86, 0x1e88, 0x1e88, 0x1e8a, 0x1e8a, 0x1e8c, 0x1e8c, 0x1e8e, 0x1e8e, 0x1e90, 0x1e90, 0x1e92, 0x1e92, 0x1e94, 0x1e94, 0x1e96, 0x1e97, 0x1e98, 0x1e99, 0x1e9a, 0x1e9b, 0x1e9c, 0x1e9d, 0x1e9e, 0x1e9f, 0x1ea0, 0x1ea0, 0x1ea2, 0x1ea2, 0x1ea4, 0x1ea4, 
0x1ea6, 0x1ea6, 0x1ea8, 0x1ea8, 0x1eaa, 0x1eaa, 0x1eac, 0x1eac, 0x1eae, 0x1eae, 0x1eb0, 0x1eb0, 0x1eb2, 0x1eb2, 0x1eb4, 0x1eb4, 0x1eb6, 0x1eb6, 0x1eb8, 0x1eb8, 0x1eba, 0x1eba, 0x1ebc, 0x1ebc, 0x1ebe, 0x1ebe, 0x1ec0, 0x1ec0, 0x1ec2, 0x1ec2, 0x1ec4, 0x1ec4, 0x1ec6, 0x1ec6, 0x1ec8, 0x1ec8, 0x1eca, 0x1eca, 0x1ecc, 0x1ecc, 0x1ece, 0x1ece, 0x1ed0, 0x1ed0, 0x1ed2, 0x1ed2, 0x1ed4, 0x1ed4, 0x1ed6, 0x1ed6, 0x1ed8, 0x1ed8, 0x1eda, 0x1eda, 0x1edc, 0x1edc, 0x1ede, 0x1ede, 0x1ee0, 0x1ee0, 0x1ee2, 0x1ee2, 0x1ee4, 0x1ee4, 0x1ee6, 0x1ee6, 0x1ee8, 0x1ee8, 0x1eea, 0x1eea, 0x1eec, 0x1eec, 0x1eee, 0x1eee, 0x1ef0, 0x1ef0, 0x1ef2, 0x1ef2, 0x1ef4, 0x1ef4, 0x1ef6, 0x1ef6, 0x1ef8, 0x1ef8, 0x1efa, 0x1efb, 0x1efc, 0x1efd, 0x1efe, 0x1eff, 0x1f08, 0x1f09, 0x1f0a, 0x1f0b, 0x1f0c, 0x1f0d, 0x1f0e, 0x1f0f, 0x1f08, 0x1f09, 0x1f0a, 0x1f0b, 0x1f0c, 0x1f0d, 0x1f0e, 0x1f0f, 0x1f18, 0x1f19, 0x1f1a, 0x1f1b, 0x1f1c, 0x1f1d, 0x1f16, 0x1f17, 0x1f18, 0x1f19, 0x1f1a, 0x1f1b, 0x1f1c, 0x1f1d, 0x1f1e, 0x1f1f, 0x1f28, 0x1f29, 0x1f2a, 0x1f2b, 0x1f2c, 0x1f2d, 0x1f2e, 0x1f2f, 0x1f28, 0x1f29, 0x1f2a, 0x1f2b, 0x1f2c, 0x1f2d, 0x1f2e, 0x1f2f, 0x1f38, 0x1f39, 0x1f3a, 0x1f3b, 0x1f3c, 0x1f3d, 0x1f3e, 0x1f3f, 0x1f38, 0x1f39, 0x1f3a, 0x1f3b, 0x1f3c, 0x1f3d, 0x1f3e, 0x1f3f, 0x1f48, 0x1f49, 0x1f4a, 0x1f4b, 0x1f4c, 0x1f4d, 0x1f46, 0x1f47, 0x1f48, 0x1f49, 0x1f4a, 0x1f4b, 0x1f4c, 0x1f4d, 0x1f4e, 0x1f4f, 0x1f50, 0x1f59, 0x1f52, 0x1f5b, 0x1f54, 0x1f5d, 0x1f56, 0x1f5f, 0x1f58, 0x1f59, 0x1f5a, 0x1f5b, 0x1f5c, 0x1f5d, 0x1f5e, 0x1f5f, 0x1f68, 0x1f69, 0x1f6a, 0x1f6b, 0x1f6c, 0x1f6d, 0x1f6e, 0x1f6f, 0x1f68, 0x1f69, 0x1f6a, 0x1f6b, 0x1f6c, 0x1f6d, 0x1f6e, 0x1f6f, 0x1fba, 0x1fbb, 0x1fc8, 0x1fc9, 0x1fca, 0x1fcb, 0x1fda, 0x1fdb, 0x1ff8, 0x1ff9, 0x1fea, 0x1feb, 0x1ffa, 0x1ffb, 0x1f7e, 0x1f7f, 0x1f88, 0x1f89, 0x1f8a, 0x1f8b, 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, 0x1f88, 0x1f89, 0x1f8a, 0x1f8b, 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, 0x1f98, 0x1f99, 0x1f9a, 0x1f9b, 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, 0x1f98, 0x1f99, 0x1f9a, 0x1f9b, 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, 0x1fa8, 0x1fa9, 0x1faa, 0x1fab, 0x1fac, 0x1fad, 0x1fae, 0x1faf, 0x1fa8, 0x1fa9, 0x1faa, 0x1fab, 0x1fac, 0x1fad, 0x1fae, 0x1faf, 0x1fb8, 0x1fb9, 0x1fb2, 0x1fbc, 0x1fb4, 0x1fb5, 0x1fb6, 0x1fb7, 0x1fb8, 0x1fb9, 0x1fba, 0x1fbb, 0x1fbc, 0x1fbd, 0x1fbe, 0x1fbf, 0x1fc0, 0x1fc1, 0x1fc2, 0x1fc3, 0x1fc4, 0x1fc5, 0x1fc6, 0x1fc7, 0x1fc8, 0x1fc9, 0x1fca, 0x1fcb, 0x1fc3, 0x1fcd, 0x1fce, 0x1fcf, 0x1fd8, 0x1fd9, 0x1fd2, 0x1fd3, 0x1fd4, 0x1fd5, 0x1fd6, 0x1fd7, 0x1fd8, 0x1fd9, 0x1fda, 0x1fdb, 0x1fdc, 0x1fdd, 0x1fde, 0x1fdf, 0x1fe8, 0x1fe9, 0x1fe2, 0x1fe3, 0x1fe4, 0x1fec, 0x1fe6, 0x1fe7, 0x1fe8, 0x1fe9, 0x1fea, 0x1feb, 0x1fec, 0x1fed, 0x1fee, 0x1fef, 0x1ff0, 0x1ff1, 0x1ff2, 0x1ff3, 0x1ff4, 0x1ff5, 0x1ff6, 0x1ff7, 0x1ff8, 0x1ff9, 0x1ffa, 0x1ffb, 0x1ff3, 0x1ffd, 0x1ffe, 0x1fff, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200a, 0x200b, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, 0x2016, 0x2017, 0x2018, 0x2019, 0x201a, 0x201b, 0x201c, 0x201d, 0x201e, 0x201f, 0x2020, 0x2021, 0x2022, 0x2023, 0x2024, 0x2025, 0x2026, 0x2027, 0x2028, 0x2029, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e, 0x202f, 0x2030, 0x2031, 0x2032, 0x2033, 0x2034, 0x2035, 0x2036, 0x2037, 0x2038, 0x2039, 0x203a, 0x203b, 0x203c, 0x203d, 0x203e, 0x203f, 0x2040, 0x2041, 0x2042, 0x2043, 0x2044, 0x2045, 0x2046, 0x2047, 0x2048, 0x2049, 0x204a, 0x204b, 0x204c, 0x204d, 0x204e, 0x204f, 0x2050, 0x2051, 0x2052, 0x2053, 0x2054, 0x2055, 0x2056, 0x2057, 0x2058, 0x2059, 0x205a, 0x205b, 0x205c, 0x205d, 0x205e, 0x205f, 0x2060, 0x2061, 
0x2062, 0x2063, 0x2064, 0x2065, 0x2066, 0x2067, 0x2068, 0x2069, 0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0x2070, 0x2071, 0x2072, 0x2073, 0x2074, 0x2075, 0x2076, 0x2077, 0x2078, 0x2079, 0x207a, 0x207b, 0x207c, 0x207d, 0x207e, 0x207f, 0x2080, 0x2081, 0x2082, 0x2083, 0x2084, 0x2085, 0x2086, 0x2087, 0x2088, 0x2089, 0x208a, 0x208b, 0x208c, 0x208d, 0x208e, 0x208f, 0x2090, 0x2091, 0x2092, 0x2093, 0x2094, 0x2095, 0x2096, 0x2097, 0x2098, 0x2099, 0x209a, 0x209b, 0x209c, 0x209d, 0x209e, 0x209f, 0x20a0, 0x20a1, 0x20a2, 0x20a3, 0x20a4, 0x20a5, 0x20a6, 0x20a7, 0x20a8, 0x20a9, 0x20aa, 0x20ab, 0x20ac, 0x20ad, 0x20ae, 0x20af, 0x20b0, 0x20b1, 0x20b2, 0x20b3, 0x20b4, 0x20b5, 0x20b6, 0x20b7, 0x20b8, 0x20b9, 0x20ba, 0x20bb, 0x20bc, 0x20bd, 0x20be, 0x20bf, 0x20c0, 0x20c1, 0x20c2, 0x20c3, 0x20c4, 0x20c5, 0x20c6, 0x20c7, 0x20c8, 0x20c9, 0x20ca, 0x20cb, 0x20cc, 0x20cd, 0x20ce, 0x20cf, 0x20d0, 0x20d1, 0x20d2, 0x20d3, 0x20d4, 0x20d5, 0x20d6, 0x20d7, 0x20d8, 0x20d9, 0x20da, 0x20db, 0x20dc, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x20e1, 0x20e2, 0x20e3, 0x20e4, 0x20e5, 0x20e6, 0x20e7, 0x20e8, 0x20e9, 0x20ea, 0x20eb, 0x20ec, 0x20ed, 0x20ee, 0x20ef, 0x20f0, 0x20f1, 0x20f2, 0x20f3, 0x20f4, 0x20f5, 0x20f6, 0x20f7, 0x20f8, 0x20f9, 0x20fa, 0x20fb, 0x20fc, 0x20fd, 0x20fe, 0x20ff, 0x2100, 0x2101, 0x2102, 0x2103, 0x2104, 0x2105, 0x2106, 0x2107, 0x2108, 0x2109, 0x210a, 0x210b, 0x210c, 0x210d, 0x210e, 0x210f, 0x2110, 0x2111, 0x2112, 0x2113, 0x2114, 0x2115, 0x2116, 0x2117, 0x2118, 0x2119, 0x211a, 0x211b, 0x211c, 0x211d, 0x211e, 0x211f, 0x2120, 0x2121, 0x2122, 0x2123, 0x2124, 0x2125, 0x2126, 0x2127, 0x2128, 0x2129, 0x212a, 0x212b, 0x212c, 0x212d, 0x212e, 0x212f, 0x2130, 0x2131, 0x2132, 0x2133, 0x2134, 0x2135, 0x2136, 0x2137, 0x2138, 0x2139, 0x213a, 0x213b, 0x213c, 0x213d, 0x213e, 0x213f, 0x2140, 0x2141, 0x2142, 0x2143, 0x2144, 0x2145, 0x2146, 0x2147, 0x2148, 0x2149, 0x214a, 0x214b, 0x214c, 0x214d, 0x2132, 0x214f, 0x2150, 0x2151, 0x2152, 0x2153, 0x2154, 0x2155, 0x2156, 0x2157, 0x2158, 0x2159, 0x215a, 0x215b, 0x215c, 0x215d, 0x215e, 0x215f, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166, 0x2167, 0x2168, 0x2169, 0x216a, 0x216b, 0x216c, 0x216d, 0x216e, 0x216f, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166, 0x2167, 0x2168, 0x2169, 0x216a, 0x216b, 0x216c, 0x216d, 0x216e, 0x216f, 0x2180, 0x2181, 0x2182, 0x2183, 0x2183, 0xffff, 0x034b, 0x24b6, 0x24b7, 0x24b8, 0x24b9, 0x24ba, 0x24bb, 0x24bc, 0x24bd, 0x24be, 0x24bf, 0x24c0, 0x24c1, 0x24c2, 0x24c3, 0x24c4, 0x24c5, 0x24c6, 0x24c7, 0x24c8, 0x24c9, 0x24ca, 0x24cb, 0x24cc, 0x24cd, 0x24ce, 0x24cf, 0xffff, 0x0746, 0x2c00, 0x2c01, 0x2c02, 0x2c03, 0x2c04, 0x2c05, 0x2c06, 0x2c07, 0x2c08, 0x2c09, 0x2c0a, 0x2c0b, 0x2c0c, 0x2c0d, 0x2c0e, 0x2c0f, 0x2c10, 0x2c11, 0x2c12, 0x2c13, 0x2c14, 0x2c15, 0x2c16, 0x2c17, 0x2c18, 0x2c19, 0x2c1a, 0x2c1b, 0x2c1c, 0x2c1d, 0x2c1e, 0x2c1f, 0x2c20, 0x2c21, 0x2c22, 0x2c23, 0x2c24, 0x2c25, 0x2c26, 0x2c27, 0x2c28, 0x2c29, 0x2c2a, 0x2c2b, 0x2c2c, 0x2c2d, 0x2c2e, 0x2c5f, 0x2c60, 0x2c60, 0x2c62, 0x2c63, 0x2c64, 0x2c65, 0x2c66, 0x2c67, 0x2c67, 0x2c69, 0x2c69, 0x2c6b, 0x2c6b, 0x2c6d, 0x2c6e, 0x2c6f, 0x2c70, 0x2c71, 0x2c72, 0x2c73, 0x2c74, 0x2c75, 0x2c75, 0x2c77, 0x2c78, 0x2c79, 0x2c7a, 0x2c7b, 0x2c7c, 0x2c7d, 0x2c7e, 0x2c7f, 0x2c80, 0x2c80, 0x2c82, 0x2c82, 0x2c84, 0x2c84, 0x2c86, 0x2c86, 0x2c88, 0x2c88, 0x2c8a, 0x2c8a, 0x2c8c, 0x2c8c, 0x2c8e, 0x2c8e, 0x2c90, 0x2c90, 0x2c92, 0x2c92, 0x2c94, 0x2c94, 0x2c96, 0x2c96, 0x2c98, 0x2c98, 0x2c9a, 0x2c9a, 0x2c9c, 0x2c9c, 0x2c9e, 0x2c9e, 0x2ca0, 0x2ca0, 0x2ca2, 0x2ca2, 0x2ca4, 0x2ca4, 0x2ca6, 0x2ca6, 0x2ca8, 0x2ca8, 0x2caa, 
0x2caa, 0x2cac, 0x2cac, 0x2cae, 0x2cae, 0x2cb0, 0x2cb0, 0x2cb2, 0x2cb2, 0x2cb4, 0x2cb4, 0x2cb6, 0x2cb6, 0x2cb8, 0x2cb8, 0x2cba, 0x2cba, 0x2cbc, 0x2cbc, 0x2cbe, 0x2cbe, 0x2cc0, 0x2cc0, 0x2cc2, 0x2cc2, 0x2cc4, 0x2cc4, 0x2cc6, 0x2cc6, 0x2cc8, 0x2cc8, 0x2cca, 0x2cca, 0x2ccc, 0x2ccc, 0x2cce, 0x2cce, 0x2cd0, 0x2cd0, 0x2cd2, 0x2cd2, 0x2cd4, 0x2cd4, 0x2cd6, 0x2cd6, 0x2cd8, 0x2cd8, 0x2cda, 0x2cda, 0x2cdc, 0x2cdc, 0x2cde, 0x2cde, 0x2ce0, 0x2ce0, 0x2ce2, 0x2ce2, 0x2ce4, 0x2ce5, 0x2ce6, 0x2ce7, 0x2ce8, 0x2ce9, 0x2cea, 0x2ceb, 0x2cec, 0x2ced, 0x2cee, 0x2cef, 0x2cf0, 0x2cf1, 0x2cf2, 0x2cf3, 0x2cf4, 0x2cf5, 0x2cf6, 0x2cf7, 0x2cf8, 0x2cf9, 0x2cfa, 0x2cfb, 0x2cfc, 0x2cfd, 0x2cfe, 0x2cff, 0x10a0, 0x10a1, 0x10a2, 0x10a3, 0x10a4, 0x10a5, 0x10a6, 0x10a7, 0x10a8, 0x10a9, 0x10aa, 0x10ab, 0x10ac, 0x10ad, 0x10ae, 0x10af, 0x10b0, 0x10b1, 0x10b2, 0x10b3, 0x10b4, 0x10b5, 0x10b6, 0x10b7, 0x10b8, 0x10b9, 0x10ba, 0x10bb, 0x10bc, 0x10bd, 0x10be, 0x10bf, 0x10c0, 0x10c1, 0x10c2, 0x10c3, 0x10c4, 0x10c5, 0xffff, 0xd21b, 0xff21, 0xff22, 0xff23, 0xff24, 0xff25, 0xff26, 0xff27, 0xff28, 0xff29, 0xff2a, 0xff2b, 0xff2c, 0xff2d, 0xff2e, 0xff2f, 0xff30, 0xff31, 0xff32, 0xff33, 0xff34, 0xff35, 0xff36, 0xff37, 0xff38, 0xff39, 0xff3a, 0xff5b, 0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67, 0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f, 0xff70, 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77, 0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f, 0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87, 0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f, 0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97, 0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f, 0xffa0, 0xffa1, 0xffa2, 0xffa3, 0xffa4, 0xffa5, 0xffa6, 0xffa7, 0xffa8, 0xffa9, 0xffaa, 0xffab, 0xffac, 0xffad, 0xffae, 0xffaf, 0xffb0, 0xffb1, 0xffb2, 0xffb3, 0xffb4, 0xffb5, 0xffb6, 0xffb7, 0xffb8, 0xffb9, 0xffba, 0xffbb, 0xffbc, 0xffbd, 0xffbe, 0xffbf, 0xffc0, 0xffc1, 0xffc2, 0xffc3, 0xffc4, 0xffc5, 0xffc6, 0xffc7, 0xffc8, 0xffc9, 0xffca, 0xffcb, 0xffcc, 0xffcd, 0xffce, 0xffcf, 0xffd0, 0xffd1, 0xffd2, 0xffd3, 0xffd4, 0xffd5, 0xffd6, 0xffd7, 0xffd8, 0xffd9, 0xffda, 0xffdb, 0xffdc, 0xffdd, 0xffde, 0xffdf, 0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe7, 0xffe8, 0xffe9, 0xffea, 0xffeb, 0xffec, 0xffed, 0xffee, 0xffef, 0xfff0, 0xfff1, 0xfff2, 0xfff3, 0xfff4, 0xfff5, 0xfff6, 0xfff7, 0xfff8, 0xfff9, 0xfffa, 0xfffb, 0xfffc, 0xfffd, 0xfffe, 0xffff, }; /* * Allow full-width illegal characters : * "MS windows 7" supports full-width-invalid-name-characters. * So we should check half-width-invalid-name-characters(ASCII) only * for compatibility. * * " * / : < > ? 
\ | */ static unsigned short bad_uni_chars[] = { 0x0022, 0x002A, 0x002F, 0x003A, 0x003C, 0x003E, 0x003F, 0x005C, 0x007C, 0 }; static int exfat_convert_char_to_ucs2(struct nls_table *nls, const unsigned char *ch, int ch_len, unsigned short *ucs2, int *lossy) { int len; *ucs2 = 0x0; if (ch[0] < 0x80) { *ucs2 = ch[0]; return 1; } len = nls->char2uni(ch, ch_len, ucs2); if (len < 0) { /* conversion failed */ if (lossy != NULL) *lossy |= NLS_NAME_LOSSY; *ucs2 = '_'; return 1; } return len; } static int exfat_convert_ucs2_to_char(struct nls_table *nls, unsigned short ucs2, unsigned char *ch, int *lossy) { int len; ch[0] = 0x0; if (ucs2 < 0x0080) { ch[0] = ucs2; return 1; } len = nls->uni2char(ucs2, ch, MAX_CHARSET_SIZE); if (len < 0) { /* conversion failed */ if (lossy != NULL) *lossy |= NLS_NAME_LOSSY; ch[0] = '_'; return 1; } return len; } unsigned short exfat_toupper(struct super_block *sb, unsigned short a) { struct exfat_sb_info *sbi = EXFAT_SB(sb); return sbi->vol_utbl[a] ? sbi->vol_utbl[a] : a; } static unsigned short *exfat_wstrchr(unsigned short *str, unsigned short wchar) { while (*str) { if (*(str++) == wchar) return str; } return NULL; } int exfat_uniname_ncmp(struct super_block *sb, unsigned short *a, unsigned short *b, unsigned int len) { int i; for (i = 0; i < len; i++, a++, b++) if (exfat_toupper(sb, *a) != exfat_toupper(sb, *b)) return 1; return 0; } static int exfat_utf16_to_utf8(struct super_block *sb, struct exfat_uni_name *p_uniname, unsigned char *p_cstring, int buflen) { int len; const unsigned short *uniname = p_uniname->name; /* always len >= 0 */ len = utf16s_to_utf8s(uniname, MAX_NAME_LENGTH, UTF16_HOST_ENDIAN, p_cstring, buflen); p_cstring[len] = '\0'; return len; } static int exfat_utf8_to_utf16(struct super_block *sb, const unsigned char *p_cstring, const int len, struct exfat_uni_name *p_uniname, int *p_lossy) { int i, unilen, lossy = NLS_NAME_NO_LOSSY; __le16 upname[MAX_NAME_LENGTH + 1]; unsigned short *uniname = p_uniname->name; WARN_ON(!len); unilen = utf8s_to_utf16s(p_cstring, len, UTF16_HOST_ENDIAN, (wchar_t *)uniname, MAX_NAME_LENGTH + 2); if (unilen < 0) { exfat_err(sb, "failed to %s (err : %d) nls len : %d", __func__, unilen, len); return unilen; } if (unilen > MAX_NAME_LENGTH) { exfat_debug(sb, "failed to %s (estr:ENAMETOOLONG) nls len : %d, unilen : %d > %d", __func__, len, unilen, MAX_NAME_LENGTH); return -ENAMETOOLONG; } for (i = 0; i < unilen; i++) { if (*uniname < 0x0020 || exfat_wstrchr(bad_uni_chars, *uniname)) lossy |= NLS_NAME_LOSSY; upname[i] = cpu_to_le16(exfat_toupper(sb, *uniname)); uniname++; } *uniname = '\0'; p_uniname->name_len = unilen; p_uniname->name_hash = exfat_calc_chksum16(upname, unilen << 1, 0, CS_DEFAULT); if (p_lossy) *p_lossy = lossy; return unilen; } #define SURROGATE_MASK 0xfffff800 #define SURROGATE_PAIR 0x0000d800 #define SURROGATE_LOW 0x00000400 static int __exfat_utf16_to_nls(struct super_block *sb, struct exfat_uni_name *p_uniname, unsigned char *p_cstring, int buflen) { int i, j, len, out_len = 0; unsigned char buf[MAX_CHARSET_SIZE]; const unsigned short *uniname = p_uniname->name; struct nls_table *nls = EXFAT_SB(sb)->nls_io; i = 0; while (i < MAX_NAME_LENGTH && out_len < (buflen - 1)) { if (*uniname == '\0') break; if ((*uniname & SURROGATE_MASK) != SURROGATE_PAIR) { len = exfat_convert_ucs2_to_char(nls, *uniname, buf, NULL); } else { /* Process UTF-16 surrogate pair as one character */ if (!(*uniname & SURROGATE_LOW) && i+1 < MAX_NAME_LENGTH && (*(uniname+1) & SURROGATE_MASK) == SURROGATE_PAIR && (*(uniname+1) & 
SURROGATE_LOW)) { uniname++; i++; } /* * UTF-16 surrogate pair encodes code points above * U+FFFF. Code points above U+FFFF are not supported * by kernel NLS framework therefore use replacement * character */ len = 1; buf[0] = '_'; } if (out_len + len >= buflen) len = buflen - 1 - out_len; out_len += len; if (len > 1) { for (j = 0; j < len; j++) *p_cstring++ = buf[j]; } else { /* len == 1 */ *p_cstring++ = *buf; } uniname++; i++; } *p_cstring = '\0'; return out_len; } static int exfat_nls_to_ucs2(struct super_block *sb, const unsigned char *p_cstring, const int len, struct exfat_uni_name *p_uniname, int *p_lossy) { int i = 0, unilen = 0, lossy = NLS_NAME_NO_LOSSY; __le16 upname[MAX_NAME_LENGTH + 1]; unsigned short *uniname = p_uniname->name; struct nls_table *nls = EXFAT_SB(sb)->nls_io; WARN_ON(!len); while (unilen < MAX_NAME_LENGTH && i < len) { i += exfat_convert_char_to_ucs2(nls, p_cstring + i, len - i, uniname, &lossy); if (*uniname < 0x0020 || exfat_wstrchr(bad_uni_chars, *uniname)) lossy |= NLS_NAME_LOSSY; upname[unilen] = cpu_to_le16(exfat_toupper(sb, *uniname)); uniname++; unilen++; } if (p_cstring[i] != '\0') lossy |= NLS_NAME_OVERLEN; *uniname = '\0'; p_uniname->name_len = unilen; p_uniname->name_hash = exfat_calc_chksum16(upname, unilen << 1, 0, CS_DEFAULT); if (p_lossy) *p_lossy = lossy; return unilen; } int exfat_utf16_to_nls(struct super_block *sb, struct exfat_uni_name *uniname, unsigned char *p_cstring, int buflen) { if (EXFAT_SB(sb)->options.utf8) return exfat_utf16_to_utf8(sb, uniname, p_cstring, buflen); return __exfat_utf16_to_nls(sb, uniname, p_cstring, buflen); } int exfat_nls_to_utf16(struct super_block *sb, const unsigned char *p_cstring, const int len, struct exfat_uni_name *uniname, int *p_lossy) { if (EXFAT_SB(sb)->options.utf8) return exfat_utf8_to_utf16(sb, p_cstring, len, uniname, p_lossy); return exfat_nls_to_ucs2(sb, p_cstring, len, uniname, p_lossy); } static int exfat_load_upcase_table(struct super_block *sb, sector_t sector, unsigned long long num_sectors, unsigned int utbl_checksum) { struct exfat_sb_info *sbi = EXFAT_SB(sb); unsigned int sect_size = sb->s_blocksize; unsigned int i, index = 0; u32 chksum = 0; int ret; unsigned char skip = false; unsigned short *upcase_table; upcase_table = kvcalloc(UTBL_COUNT, sizeof(unsigned short), GFP_KERNEL); if (!upcase_table) return -ENOMEM; sbi->vol_utbl = upcase_table; num_sectors += sector; while (sector < num_sectors) { struct buffer_head *bh; bh = sb_bread(sb, sector); if (!bh) { exfat_err(sb, "failed to read sector(0x%llx)", (unsigned long long)sector); ret = -EIO; goto free_table; } sector++; for (i = 0; i < sect_size && index <= 0xFFFF; i += 2) { unsigned short uni = get_unaligned_le16(bh->b_data + i); if (skip) { index += uni; skip = false; } else if (uni == index) { index++; } else if (uni == 0xFFFF) { skip = true; } else { /* uni != index , uni != 0xFFFF */ upcase_table[index] = uni; index++; } } chksum = exfat_calc_chksum32(bh->b_data, i, chksum, CS_DEFAULT); brelse(bh); } if (index >= 0xFFFF && utbl_checksum == chksum) return 0; exfat_err(sb, "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)", index, chksum, utbl_checksum); ret = -EINVAL; free_table: exfat_free_upcase_table(sbi); return ret; } static int exfat_load_default_upcase_table(struct super_block *sb) { int i, ret = -EIO; struct exfat_sb_info *sbi = EXFAT_SB(sb); unsigned char skip = false; unsigned short uni = 0, *upcase_table; unsigned int index = 0; upcase_table = kvcalloc(UTBL_COUNT, sizeof(unsigned short), 
GFP_KERNEL); if (!upcase_table) return -ENOMEM; sbi->vol_utbl = upcase_table; for (i = 0; index <= 0xFFFF && i < EXFAT_NUM_UPCASE; i++) { uni = uni_def_upcase[i]; if (skip) { index += uni; skip = false; } else if (uni == index) { index++; } else if (uni == 0xFFFF) { skip = true; } else { upcase_table[index] = uni; index++; } } if (index >= 0xFFFF) return 0; /* FATAL error: default upcase table has error */ exfat_free_upcase_table(sbi); return ret; } int exfat_create_upcase_table(struct super_block *sb) { int i, ret; unsigned int tbl_clu, type; sector_t sector; unsigned long long tbl_size, num_sectors; unsigned char blksize_bits = sb->s_blocksize_bits; struct exfat_chain clu; struct exfat_dentry *ep; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct buffer_head *bh; clu.dir = sbi->root_dir; clu.flags = ALLOC_FAT_CHAIN; while (clu.dir != EXFAT_EOF_CLUSTER) { for (i = 0; i < sbi->dentries_per_clu; i++) { ep = exfat_get_dentry(sb, &clu, i, &bh); if (!ep) return -EIO; type = exfat_get_entry_type(ep); if (type == TYPE_UNUSED) { brelse(bh); break; } if (type != TYPE_UPCASE) { brelse(bh); continue; } tbl_clu = le32_to_cpu(ep->dentry.upcase.start_clu); tbl_size = le64_to_cpu(ep->dentry.upcase.size); sector = exfat_cluster_to_sector(sbi, tbl_clu); num_sectors = ((tbl_size - 1) >> blksize_bits) + 1; ret = exfat_load_upcase_table(sb, sector, num_sectors, le32_to_cpu(ep->dentry.upcase.checksum)); brelse(bh); if (ret && ret != -EIO) goto load_default; /* load successfully */ return ret; } if (exfat_get_next_cluster(sb, &(clu.dir))) return -EIO; } load_default: /* load default upcase table */ return exfat_load_default_upcase_table(sb); } void exfat_free_upcase_table(struct exfat_sb_info *sbi) { kvfree(sbi->vol_utbl); }
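/*
 * Example (editorial): the compressed up-case table format decoded by
 * exfat_load_upcase_table() and exfat_load_default_upcase_table() above.
 * A word equal to the current index advances the index (an identity
 * mapping, later served by the fallback in exfat_toupper()); 0xFFFF means
 * the next word is a count of identity entries to skip; any other word is
 * stored as an explicit upper-case mapping. A minimal user-space sketch of
 * the same loop over a hypothetical four-word stream:
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* Hypothetical stream: 0x61 identity entries, then 'a'->'A', 'b'->'B'. */
	static const unsigned short compressed[] = {
		0xFFFF, 0x0061, 0x0041, 0x0042,
	};
	unsigned short *tbl = calloc(0x10000, sizeof(*tbl));
	unsigned int index = 0;
	int skip = 0;
	size_t i;

	if (!tbl)
		return 1;
	for (i = 0; i < sizeof(compressed) / sizeof(compressed[0]) &&
		    index <= 0xFFFF; i++) {
		unsigned short uni = compressed[i];

		if (skip) {		/* previous word was 0xFFFF */
			index += uni;	/* skip 'uni' identity entries */
			skip = 0;
		} else if (uni == index) {
			index++;	/* explicit identity entry */
		} else if (uni == 0xFFFF) {
			skip = 1;	/* next word is a skip count */
		} else {
			tbl[index++] = uni;	/* explicit mapping */
		}
	}
	/* As in exfat_toupper(): a zero entry falls back to the identity. */
	printf("'a' -> '%c', '0' -> '%c'\n",
	       tbl[0x61], tbl[0x30] ? tbl[0x30] : 0x30);
	free(tbl);
	return 0;
}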
// SPDX-License-Identifier: GPL-2.0 /* * usb port device code * * Copyright (C) 2012 Intel Corp * * Author: Lan Tianyu <tianyu.lan@intel.com> */ #include <linux/kstrtox.h> #include <linux/slab.h> #include <linux/pm_qos.h> #include <linux/component.h> #include "hub.h" static int usb_port_block_power_off; static const struct attribute_group *port_dev_group[]; static
ssize_t early_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); return sysfs_emit(buf, "%s\n", port_dev->early_stop ? "yes" : "no"); } static ssize_t early_stop_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_port *port_dev = to_usb_port(dev); bool value; if (kstrtobool(buf, &value)) return -EINVAL; if (value) port_dev->early_stop = 1; else port_dev->early_stop = 0; return count; } static DEVICE_ATTR_RW(early_stop); static ssize_t disable_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); struct usb_device *hdev = to_usb_device(dev->parent->parent); struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_interface *intf = to_usb_interface(hub->intfdev); int port1 = port_dev->portnum; u16 portstatus, unused; bool disabled; int rc; rc = usb_autopm_get_interface(intf); if (rc < 0) return rc; usb_lock_device(hdev); if (hub->disconnected) { rc = -ENODEV; goto out_hdev_lock; } usb_hub_port_status(hub, port1, &portstatus, &unused); disabled = !usb_port_is_power_on(hub, portstatus); out_hdev_lock: usb_unlock_device(hdev); usb_autopm_put_interface(intf); if (rc) return rc; return sysfs_emit(buf, "%s\n", disabled ? "1" : "0"); } static ssize_t disable_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_port *port_dev = to_usb_port(dev); struct usb_device *hdev = to_usb_device(dev->parent->parent); struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_interface *intf = to_usb_interface(hub->intfdev); int port1 = port_dev->portnum; bool disabled; int rc; rc = kstrtobool(buf, &disabled); if (rc) return rc; rc = usb_autopm_get_interface(intf); if (rc < 0) return rc; usb_lock_device(hdev); if (hub->disconnected) { rc = -ENODEV; goto out_hdev_lock; } if (disabled && port_dev->child) usb_disconnect(&port_dev->child); rc = usb_hub_set_port_power(hdev, hub, port1, !disabled); if (disabled) { usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_CONNECTION); if (!port_dev->is_superspeed) usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_ENABLE); } if (!rc) rc = count; out_hdev_lock: usb_unlock_device(hdev); usb_autopm_put_interface(intf); return rc; } static DEVICE_ATTR_RW(disable); static ssize_t location_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); return sprintf(buf, "0x%08x\n", port_dev->location); } static DEVICE_ATTR_RO(location); static ssize_t connect_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); char *result; switch (port_dev->connect_type) { case USB_PORT_CONNECT_TYPE_HOT_PLUG: result = "hotplug"; break; case USB_PORT_CONNECT_TYPE_HARD_WIRED: result = "hardwired"; break; case USB_PORT_NOT_USED: result = "not used"; break; default: result = "unknown"; break; } return sprintf(buf, "%s\n", result); } static DEVICE_ATTR_RO(connect_type); static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); enum usb_device_state state = READ_ONCE(port_dev->state); return sysfs_emit(buf, "%s\n", usb_state_string(state)); } static DEVICE_ATTR_RO(state); static ssize_t over_current_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); return sprintf(buf, "%u\n", 
port_dev->over_current_count); } static DEVICE_ATTR_RO(over_current_count); static ssize_t quirks_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); return sprintf(buf, "%08x\n", port_dev->quirks); } static ssize_t quirks_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_port *port_dev = to_usb_port(dev); u32 value; if (kstrtou32(buf, 16, &value)) return -EINVAL; port_dev->quirks = value; return count; } static DEVICE_ATTR_RW(quirks); static ssize_t usb3_lpm_permit_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_port *port_dev = to_usb_port(dev); const char *p; if (port_dev->usb3_lpm_u1_permit) { if (port_dev->usb3_lpm_u2_permit) p = "u1_u2"; else p = "u1"; } else { if (port_dev->usb3_lpm_u2_permit) p = "u2"; else p = "0"; } return sprintf(buf, "%s\n", p); } static ssize_t usb3_lpm_permit_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_port *port_dev = to_usb_port(dev); struct usb_device *udev = port_dev->child; struct usb_hcd *hcd; if (!strncmp(buf, "u1_u2", 5)) { port_dev->usb3_lpm_u1_permit = 1; port_dev->usb3_lpm_u2_permit = 1; } else if (!strncmp(buf, "u1", 2)) { port_dev->usb3_lpm_u1_permit = 1; port_dev->usb3_lpm_u2_permit = 0; } else if (!strncmp(buf, "u2", 2)) { port_dev->usb3_lpm_u1_permit = 0; port_dev->usb3_lpm_u2_permit = 1; } else if (!strncmp(buf, "0", 1)) { port_dev->usb3_lpm_u1_permit = 0; port_dev->usb3_lpm_u2_permit = 0; } else return -EINVAL; /* If device is connected to the port, disable or enable lpm * to make new u1 u2 setting take effect immediately. */ if (udev) { hcd = bus_to_hcd(udev->bus); if (!hcd) return -EINVAL; usb_lock_device(udev); mutex_lock(hcd->bandwidth_mutex); if (!usb_disable_lpm(udev)) usb_enable_lpm(udev); mutex_unlock(hcd->bandwidth_mutex); usb_unlock_device(udev); } return count; } static DEVICE_ATTR_RW(usb3_lpm_permit); static struct attribute *port_dev_attrs[] = { &dev_attr_connect_type.attr, &dev_attr_state.attr, &dev_attr_location.attr, &dev_attr_quirks.attr, &dev_attr_over_current_count.attr, &dev_attr_disable.attr, &dev_attr_early_stop.attr, NULL, }; static const struct attribute_group port_dev_attr_grp = { .attrs = port_dev_attrs, }; static const struct attribute_group *port_dev_group[] = { &port_dev_attr_grp, NULL, }; static struct attribute *port_dev_usb3_attrs[] = { &dev_attr_usb3_lpm_permit.attr, NULL, }; static const struct attribute_group port_dev_usb3_attr_grp = { .attrs = port_dev_usb3_attrs, }; static const struct attribute_group *port_dev_usb3_group[] = { &port_dev_attr_grp, &port_dev_usb3_attr_grp, NULL, }; static void usb_port_device_release(struct device *dev) { struct usb_port *port_dev = to_usb_port(dev); kfree(port_dev->req); kfree(port_dev); } #ifdef CONFIG_PM static int usb_port_runtime_resume(struct device *dev) { struct usb_port *port_dev = to_usb_port(dev); struct usb_device *hdev = to_usb_device(dev->parent->parent); struct usb_interface *intf = to_usb_interface(dev->parent); struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_device *udev = port_dev->child; struct usb_port *peer = port_dev->peer; int port1 = port_dev->portnum; int retval; if (!hub) return -EINVAL; if (hub->in_reset) { set_bit(port1, hub->power_bits); return 0; } /* * Power on our usb3 peer before this usb2 port to prevent a usb3 * device from degrading to its usb2 connection */ if (!port_dev->is_superspeed && peer) 
pm_runtime_get_sync(&peer->dev); retval = usb_autopm_get_interface(intf); if (retval < 0) return retval; retval = usb_hub_set_port_power(hdev, hub, port1, true); msleep(hub_power_on_good_delay(hub)); if (udev && !retval) { /* * Our preference is to simply wait for the port to reconnect, * as that is the lowest latency method to restart the port. * However, there are cases where toggling port power results in * the host port and the device port getting out of sync causing * a link training live lock. Upon timeout, flag the port as * needing warm reset recovery (to be performed later by * usb_port_resume() as requested via usb_wakeup_notification()) */ if (hub_port_debounce_be_connected(hub, port1) < 0) { dev_dbg(&port_dev->dev, "reconnect timeout\n"); if (hub_is_superspeed(hdev)) set_bit(port1, hub->warm_reset_bits); } /* Force the child awake to revalidate after the power loss. */ if (!test_and_set_bit(port1, hub->child_usage_bits)) { pm_runtime_get_noresume(&port_dev->dev); pm_request_resume(&udev->dev); } } usb_autopm_put_interface(intf); return retval; } static int usb_port_runtime_suspend(struct device *dev) { struct usb_port *port_dev = to_usb_port(dev); struct usb_device *hdev = to_usb_device(dev->parent->parent); struct usb_interface *intf = to_usb_interface(dev->parent); struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_port *peer = port_dev->peer; int port1 = port_dev->portnum; int retval; if (!hub) return -EINVAL; if (hub->in_reset) return -EBUSY; if (dev_pm_qos_flags(&port_dev->dev, PM_QOS_FLAG_NO_POWER_OFF) == PM_QOS_FLAGS_ALL) return -EAGAIN; if (usb_port_block_power_off) return -EBUSY; retval = usb_autopm_get_interface(intf); if (retval < 0) return retval; retval = usb_hub_set_port_power(hdev, hub, port1, false); usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_CONNECTION); if (!port_dev->is_superspeed) usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_ENABLE); usb_autopm_put_interface(intf); /* * Our peer usb3 port may now be able to suspend, so * asynchronously queue a suspend request to observe that this * usb2 port is now off. */ if (!port_dev->is_superspeed && peer) pm_runtime_put(&peer->dev); return retval; } #endif static void usb_port_shutdown(struct device *dev) { struct usb_port *port_dev = to_usb_port(dev); if (port_dev->child) usb_disable_usb2_hardware_lpm(port_dev->child); } static const struct dev_pm_ops usb_port_pm_ops = { #ifdef CONFIG_PM .runtime_suspend = usb_port_runtime_suspend, .runtime_resume = usb_port_runtime_resume, #endif }; struct device_type usb_port_device_type = { .name = "usb_port", .release = usb_port_device_release, .pm = &usb_port_pm_ops, }; static struct device_driver usb_port_driver = { .name = "usb", .owner = THIS_MODULE, .shutdown = usb_port_shutdown, }; static int link_peers(struct usb_port *left, struct usb_port *right) { struct usb_port *ss_port, *hs_port; int rc; if (left->peer == right && right->peer == left) return 0; if (left->peer || right->peer) { struct usb_port *lpeer = left->peer; struct usb_port *rpeer = right->peer; char *method; if (left->location && left->location == right->location) method = "location"; else method = "default"; pr_debug("usb: failed to peer %s and %s by %s (%s:%s) (%s:%s)\n", dev_name(&left->dev), dev_name(&right->dev), method, dev_name(&left->dev), lpeer ? dev_name(&lpeer->dev) : "none", dev_name(&right->dev), rpeer ? 
dev_name(&rpeer->dev) : "none"); return -EBUSY; } rc = sysfs_create_link(&left->dev.kobj, &right->dev.kobj, "peer"); if (rc) return rc; rc = sysfs_create_link(&right->dev.kobj, &left->dev.kobj, "peer"); if (rc) { sysfs_remove_link(&left->dev.kobj, "peer"); return rc; } /* * We need to wake the HiSpeed port to make sure we don't race * setting ->peer with usb_port_runtime_suspend(). Otherwise we * may miss a suspend event for the SuperSpeed port. */ if (left->is_superspeed) { ss_port = left; WARN_ON(right->is_superspeed); hs_port = right; } else { ss_port = right; WARN_ON(!right->is_superspeed); hs_port = left; } pm_runtime_get_sync(&hs_port->dev); left->peer = right; right->peer = left; /* * The SuperSpeed reference is dropped when the HiSpeed port in * this relationship suspends, i.e. when it is safe to allow a * SuperSpeed connection to drop since there is no risk of a * device degrading to its powered-off HiSpeed connection. * * Also, drop the HiSpeed ref taken above. */ pm_runtime_get_sync(&ss_port->dev); pm_runtime_put(&hs_port->dev); return 0; } static void link_peers_report(struct usb_port *left, struct usb_port *right) { int rc; rc = link_peers(left, right); if (rc == 0) { dev_dbg(&left->dev, "peered to %s\n", dev_name(&right->dev)); } else { dev_dbg(&left->dev, "failed to peer to %s (%d)\n", dev_name(&right->dev), rc); pr_warn_once("usb: port power management may be unreliable\n"); usb_port_block_power_off = 1; } } static void unlink_peers(struct usb_port *left, struct usb_port *right) { struct usb_port *ss_port, *hs_port; WARN(right->peer != left || left->peer != right, "%s and %s are not peers?\n", dev_name(&left->dev), dev_name(&right->dev)); /* * We wake the HiSpeed port to make sure we don't race its * usb_port_runtime_resume() event which takes a SuperSpeed ref * when ->peer is !NULL. */ if (left->is_superspeed) { ss_port = left; hs_port = right; } else { ss_port = right; hs_port = left; } pm_runtime_get_sync(&hs_port->dev); sysfs_remove_link(&left->dev.kobj, "peer"); right->peer = NULL; sysfs_remove_link(&right->dev.kobj, "peer"); left->peer = NULL; /* Drop the SuperSpeed ref held on behalf of the active HiSpeed port */ pm_runtime_put(&ss_port->dev); /* Drop the ref taken above */ pm_runtime_put(&hs_port->dev); } /* * For each usb hub device in the system check to see if it is in the * peer domain of the given port_dev, and if it is check to see if it * has a port that matches the given port by location */ static int match_location(struct usb_device *peer_hdev, void *p) { int port1; struct usb_hcd *hcd, *peer_hcd; struct usb_port *port_dev = p, *peer; struct usb_hub *peer_hub = usb_hub_to_struct_hub(peer_hdev); struct usb_device *hdev = to_usb_device(port_dev->dev.parent->parent); if (!peer_hub) return 0; hcd = bus_to_hcd(hdev->bus); peer_hcd = bus_to_hcd(peer_hdev->bus); /* peer_hcd is provisional until we verify it against the known peer */ if (peer_hcd != hcd->shared_hcd) return 0; for (port1 = 1; port1 <= peer_hdev->maxchild; port1++) { peer = peer_hub->ports[port1 - 1]; if (peer && peer->location == port_dev->location) { link_peers_report(port_dev, peer); return 1; /* done */ } } return 0; } /* * Find the peer port either via explicit platform firmware "location" * data, the peer hcd for root hubs, or the upstream peer relationship * for all other hubs. 
*/ static void find_and_link_peer(struct usb_hub *hub, int port1) { struct usb_port *port_dev = hub->ports[port1 - 1], *peer; struct usb_device *hdev = hub->hdev; struct usb_device *peer_hdev; struct usb_hub *peer_hub; /* * If location data is available then we can only peer this port * by a location match, not the default peer (lest we create a * situation where we need to go back and undo a default peering * when the port is later peered by location data) */ if (port_dev->location) { /* we link the peer in match_location() if found */ usb_for_each_dev(port_dev, match_location); return; } else if (!hdev->parent) { struct usb_hcd *hcd = bus_to_hcd(hdev->bus); struct usb_hcd *peer_hcd = hcd->shared_hcd; if (!peer_hcd) return; peer_hdev = peer_hcd->self.root_hub; } else { struct usb_port *upstream; struct usb_device *parent = hdev->parent; struct usb_hub *parent_hub = usb_hub_to_struct_hub(parent); if (!parent_hub) return; upstream = parent_hub->ports[hdev->portnum - 1]; if (!upstream || !upstream->peer) return; peer_hdev = upstream->peer->child; } peer_hub = usb_hub_to_struct_hub(peer_hdev); if (!peer_hub || port1 > peer_hdev->maxchild) return; /* * we found a valid default peer, last check is to make sure it * does not have location data */ peer = peer_hub->ports[port1 - 1]; if (peer && peer->location == 0) link_peers_report(port_dev, peer); } static int connector_bind(struct device *dev, struct device *connector, void *data) { struct usb_port *port_dev = to_usb_port(dev); int ret; ret = sysfs_create_link(&dev->kobj, &connector->kobj, "connector"); if (ret) return ret; ret = sysfs_create_link(&connector->kobj, &dev->kobj, dev_name(dev)); if (ret) { sysfs_remove_link(&dev->kobj, "connector"); return ret; } port_dev->connector = data; /* * If there is already USB device connected to the port, letting the * Type-C connector know about it immediately. 
*/ if (port_dev->child) typec_attach(port_dev->connector, &port_dev->child->dev); return 0; } static void connector_unbind(struct device *dev, struct device *connector, void *data) { struct usb_port *port_dev = to_usb_port(dev); sysfs_remove_link(&connector->kobj, dev_name(dev)); sysfs_remove_link(&dev->kobj, "connector"); port_dev->connector = NULL; } static const struct component_ops connector_ops = { .bind = connector_bind, .unbind = connector_unbind, }; int usb_hub_create_port_device(struct usb_hub *hub, int port1) { struct usb_port *port_dev; struct usb_device *hdev = hub->hdev; int retval; port_dev = kzalloc(sizeof(*port_dev), GFP_KERNEL); if (!port_dev) return -ENOMEM; port_dev->req = kzalloc(sizeof(*(port_dev->req)), GFP_KERNEL); if (!port_dev->req) { kfree(port_dev); return -ENOMEM; } hub->ports[port1 - 1] = port_dev; port_dev->portnum = port1; set_bit(port1, hub->power_bits); port_dev->dev.parent = hub->intfdev; if (hub_is_superspeed(hdev)) { port_dev->is_superspeed = 1; port_dev->usb3_lpm_u1_permit = 1; port_dev->usb3_lpm_u2_permit = 1; port_dev->dev.groups = port_dev_usb3_group; } else port_dev->dev.groups = port_dev_group; port_dev->dev.type = &usb_port_device_type; port_dev->dev.driver = &usb_port_driver; dev_set_name(&port_dev->dev, "%s-port%d", dev_name(&hub->hdev->dev), port1); mutex_init(&port_dev->status_lock); retval = device_register(&port_dev->dev); if (retval) { put_device(&port_dev->dev); return retval; } port_dev->state_kn = sysfs_get_dirent(port_dev->dev.kobj.sd, "state"); if (!port_dev->state_kn) { dev_err(&port_dev->dev, "failed to sysfs_get_dirent 'state'\n"); retval = -ENODEV; goto err_unregister; } /* Set default policy of port-poweroff disabled. */ retval = dev_pm_qos_add_request(&port_dev->dev, port_dev->req, DEV_PM_QOS_FLAGS, PM_QOS_FLAG_NO_POWER_OFF); if (retval < 0) { goto err_put_kn; } retval = component_add(&port_dev->dev, &connector_ops); if (retval) { dev_warn(&port_dev->dev, "failed to add component\n"); goto err_put_kn; } find_and_link_peer(hub, port1); /* * Enable runtime pm and hold a refernce that hub_configure() * will drop once the PM_QOS_NO_POWER_OFF flag state has been set * and the hub has been fully registered (hdev->maxchild set). */ pm_runtime_set_active(&port_dev->dev); pm_runtime_get_noresume(&port_dev->dev); pm_runtime_enable(&port_dev->dev); device_enable_async_suspend(&port_dev->dev); /* * Keep hidden the ability to enable port-poweroff if the hub * does not support power switching. */ if (!hub_is_port_power_switchable(hub)) return 0; /* Attempt to let userspace take over the policy. */ retval = dev_pm_qos_expose_flags(&port_dev->dev, PM_QOS_FLAG_NO_POWER_OFF); if (retval < 0) { dev_warn(&port_dev->dev, "failed to expose pm_qos_no_poweroff\n"); return 0; } /* Userspace owns the policy, drop the kernel 'no_poweroff' request. */ retval = dev_pm_qos_remove_request(port_dev->req); if (retval >= 0) { kfree(port_dev->req); port_dev->req = NULL; } return 0; err_put_kn: sysfs_put(port_dev->state_kn); err_unregister: device_unregister(&port_dev->dev); return retval; } void usb_hub_remove_port_device(struct usb_hub *hub, int port1) { struct usb_port *port_dev = hub->ports[port1 - 1]; struct usb_port *peer; peer = port_dev->peer; if (peer) unlink_peers(port_dev, peer); component_del(&port_dev->dev, &connector_ops); sysfs_put(port_dev->state_kn); device_unregister(&port_dev->dev); }
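/*
 * Example (editorial): exercising the usb3_lpm_permit attribute implemented
 * by usb3_lpm_permit_store() above from user space. The accepted values
 * ("u1_u2", "u1", "u2", "0") come from that function; the sysfs path below
 * is an assumed example layout (hub interface "2-0:1.0", port device
 * "usb2-port1" as produced by dev_set_name() above) and will differ per
 * system.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/bus/usb/devices/2-0:1.0/"
			   "usb2-port1/usb3_lpm_permit";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Permit both U1 and U2 link power management on this port. */
	fputs("u1_u2\n", f);
	return fclose(f) ? 1 : 0;
}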
// SPDX-License-Identifier: GPL-2.0-or-later /* Null security operations. * * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <net/af_rxrpc.h> #include "ar-internal.h" static int none_init_connection_security(struct rxrpc_connection *conn, struct rxrpc_key_token *token) { return 0; } /* * Work out how much data we can put in an unsecured packet. */ static int none_how_much_data(struct rxrpc_call *call, size_t remain, size_t *_buf_size, size_t *_data_size, size_t *_offset) { *_buf_size = *_data_size = min_t(size_t, remain, RXRPC_JUMBO_DATALEN); *_offset = 0; return 0; } static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { return 0; } static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); sp->flags |= RXRPC_RX_VERIFIED; return 0; } static void none_free_call_crypto(struct rxrpc_call *call) { } static int none_respond_to_challenge(struct rxrpc_connection *conn, struct sk_buff *skb) { return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, rxrpc_eproto_rxnull_challenge); } static int none_verify_response(struct rxrpc_connection *conn, struct sk_buff *skb) { return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, rxrpc_eproto_rxnull_response); } static void none_clear(struct rxrpc_connection *conn) { } static int none_init(void) { return 0; } static void none_exit(void) { } /* * RxRPC null security */ const struct rxrpc_security rxrpc_no_security = { .name = "none", .security_index = RXRPC_SECURITY_NONE, .init = none_init, .exit = none_exit, .init_connection_security = none_init_connection_security, .free_call_crypto = none_free_call_crypto, .how_much_data = none_how_much_data, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, .respond_to_challenge = none_respond_to_challenge, .verify_response = none_verify_response, .clear = none_clear, };
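/*
 * Illustrative sketch (not from the kernel tree): an ops table like
 * rxrpc_no_security above is normally selected by its ->security_index and
 * then called only through its function pointers, which is why the "null"
 * class can make every hook either a cheap no-op or a protocol abort.  Only
 * rxrpc_no_security and RXRPC_SECURITY_NONE are real symbols from the file
 * above; the table and lookup helper here are hypothetical.
 */
static const struct rxrpc_security *example_security_table[] = {
	[RXRPC_SECURITY_NONE] = &rxrpc_no_security,
	/* a real build would also slot in the rxkad class, etc. */
};

static const struct rxrpc_security *example_security_lookup(u8 security_index)
{
	if (security_index >= ARRAY_SIZE(example_security_table))
		return NULL;
	return example_security_table[security_index];
}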
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _INET_ECN_H_ #define _INET_ECN_H_ #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/if_vlan.h> #include <net/inet_sock.h> #include <net/dsfield.h> #include <net/checksum.h> enum { INET_ECN_NOT_ECT = 0, INET_ECN_ECT_1 = 1, INET_ECN_ECT_0 = 2, INET_ECN_CE = 3, INET_ECN_MASK = 3, }; extern int sysctl_tunnel_ecn_log; static inline int INET_ECN_is_ce(__u8 dsfield) { return (dsfield & INET_ECN_MASK) == INET_ECN_CE; } static inline int INET_ECN_is_not_ect(__u8 dsfield) { return (dsfield & INET_ECN_MASK) == INET_ECN_NOT_ECT; } static inline int INET_ECN_is_capable(__u8 dsfield) { return dsfield & INET_ECN_ECT_0; } /* * RFC 3168 9.1.1 * The full-functionality option for ECN encapsulation is to copy the * ECN codepoint of the inside header to the outside header on * encapsulation if the inside header is not-ECT or ECT, and to set the * ECN codepoint of the outside header to ECT(0) if the ECN codepoint of * the inside header is CE. */ static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner) { outer &= ~INET_ECN_MASK; outer |= !INET_ECN_is_ce(inner) ?
(inner & INET_ECN_MASK) : INET_ECN_ECT_0; return outer; } static inline void INET_ECN_xmit(struct sock *sk) { inet_sk(sk)->tos |= INET_ECN_ECT_0; if (inet6_sk(sk) != NULL) inet6_sk(sk)->tclass |= INET_ECN_ECT_0; } static inline void INET_ECN_dontxmit(struct sock *sk) { inet_sk(sk)->tos &= ~INET_ECN_MASK; if (inet6_sk(sk) != NULL) inet6_sk(sk)->tclass &= ~INET_ECN_MASK; } #define IP6_ECN_flow_init(label) do { \ (label) &= ~htonl(INET_ECN_MASK << 20); \ } while (0) #define IP6_ECN_flow_xmit(sk, label) do { \ if (INET_ECN_is_capable(inet6_sk(sk)->tclass)) \ (label) |= htonl(INET_ECN_ECT_0 << 20); \ } while (0) static inline int IP_ECN_set_ce(struct iphdr *iph) { u32 ecn = (iph->tos + 1) & INET_ECN_MASK; __be16 check_add; /* * After the last operation we have (in binary): * INET_ECN_NOT_ECT => 01 * INET_ECN_ECT_1 => 10 * INET_ECN_ECT_0 => 11 * INET_ECN_CE => 00 */ if (!(ecn & 2)) return !ecn; /* * The following gives us: * INET_ECN_ECT_1 => check += htons(0xFFFD) * INET_ECN_ECT_0 => check += htons(0xFFFE) */ check_add = (__force __be16)((__force u16)htons(0xFFFB) + (__force u16)htons(ecn)); iph->check = csum16_add(iph->check, check_add); iph->tos |= INET_ECN_CE; return 1; } static inline int IP_ECN_set_ect1(struct iphdr *iph) { if ((iph->tos & INET_ECN_MASK) != INET_ECN_ECT_0) return 0; iph->check = csum16_add(iph->check, htons(0x1)); iph->tos ^= INET_ECN_MASK; return 1; } static inline void IP_ECN_clear(struct iphdr *iph) { iph->tos &= ~INET_ECN_MASK; } static inline void ipv4_copy_dscp(unsigned int dscp, struct iphdr *inner) { dscp &= ~INET_ECN_MASK; ipv4_change_dsfield(inner, INET_ECN_MASK, dscp); } struct ipv6hdr; /* Note: * IP_ECN_set_ce() has to tweak IPV4 checksum when setting CE, * meaning both changes have no effect on skb->csum if/when CHECKSUM_COMPLETE * In IPv6 case, no checksum compensates the change in IPv6 header, * so we have to update skb->csum. 
*/ static inline int IP6_ECN_set_ce(struct sk_buff *skb, struct ipv6hdr *iph) { __be32 from, to; if (INET_ECN_is_not_ect(ipv6_get_dsfield(iph))) return 0; from = *(__be32 *)iph; to = from | htonl(INET_ECN_CE << 20); *(__be32 *)iph = to; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from), (__force __wsum)to); return 1; } static inline int IP6_ECN_set_ect1(struct sk_buff *skb, struct ipv6hdr *iph) { __be32 from, to; if ((ipv6_get_dsfield(iph) & INET_ECN_MASK) != INET_ECN_ECT_0) return 0; from = *(__be32 *)iph; to = from ^ htonl(INET_ECN_MASK << 20); *(__be32 *)iph = to; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from), (__force __wsum)to); return 1; } static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner) { dscp &= ~INET_ECN_MASK; ipv6_change_dsfield(inner, INET_ECN_MASK, dscp); } static inline int INET_ECN_set_ce(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) return IP_ECN_set_ce(ip_hdr(skb)); break; case cpu_to_be16(ETH_P_IPV6): if (skb_network_header(skb) + sizeof(struct ipv6hdr) <= skb_tail_pointer(skb)) return IP6_ECN_set_ce(skb, ipv6_hdr(skb)); break; } return 0; } static inline int skb_get_dsfield(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) break; return ipv4_get_dsfield(ip_hdr(skb)); case cpu_to_be16(ETH_P_IPV6): if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) break; return ipv6_get_dsfield(ipv6_hdr(skb)); } return -1; } static inline int INET_ECN_set_ect1(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) return IP_ECN_set_ect1(ip_hdr(skb)); break; case cpu_to_be16(ETH_P_IPV6): if (skb_network_header(skb) + sizeof(struct ipv6hdr) <= skb_tail_pointer(skb)) return IP6_ECN_set_ect1(skb, ipv6_hdr(skb)); break; } return 0; } /* * RFC 6040 4.2 * To decapsulate the inner header at the tunnel egress, a compliant * tunnel egress MUST set the outgoing ECN field to the codepoint at the * intersection of the appropriate arriving inner header (row) and outer * header (column) in Figure 4 * * +---------+------------------------------------------------+ * |Arriving | Arriving Outer Header | * | Inner +---------+------------+------------+------------+ * | Header | Not-ECT | ECT(0) | ECT(1) | CE | * +---------+---------+------------+------------+------------+ * | Not-ECT | Not-ECT |Not-ECT(!!!)|Not-ECT(!!!)| <drop>(!!!)| * | ECT(0) | ECT(0) | ECT(0) | ECT(1) | CE | * | ECT(1) | ECT(1) | ECT(1) (!) | ECT(1) | CE | * | CE | CE | CE | CE(!!!)| CE | * +---------+---------+------------+------------+------------+ * * Figure 4: New IP in IP Decapsulation Behaviour * * returns 0 on success * 1 if something is broken and should be logged (!!! 
above) * 2 if packet should be dropped */ static inline int __INET_ECN_decapsulate(__u8 outer, __u8 inner, bool *set_ce) { if (INET_ECN_is_not_ect(inner)) { switch (outer & INET_ECN_MASK) { case INET_ECN_NOT_ECT: return 0; case INET_ECN_ECT_0: case INET_ECN_ECT_1: return 1; case INET_ECN_CE: return 2; } } *set_ce = INET_ECN_is_ce(outer); return 0; } static inline int INET_ECN_decapsulate(struct sk_buff *skb, __u8 outer, __u8 inner) { bool set_ce = false; int rc; rc = __INET_ECN_decapsulate(outer, inner, &set_ce); if (!rc) { if (set_ce) INET_ECN_set_ce(skb); else if ((outer & INET_ECN_MASK) == INET_ECN_ECT_1) INET_ECN_set_ect1(skb); } return rc; } static inline int IP_ECN_decapsulate(const struct iphdr *oiph, struct sk_buff *skb) { __u8 inner; switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; break; case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); break; default: return 0; } return INET_ECN_decapsulate(skb, oiph->tos, inner); } static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h, struct sk_buff *skb) { __u8 inner; switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; break; case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); break; default: return 0; } return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner); } #endif
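/*
 * Standalone user-space sketch (not part of the header above) that exercises
 * the RFC 6040 decapsulation rules exactly as __INET_ECN_decapsulate()
 * encodes them, printing the verdict for every inner/outer codepoint pair.
 * It only re-uses the two-bit codepoint values defined in the header and can
 * be compiled with any ordinary C compiler.
 */
#include <stdio.h>
#include <stdbool.h>

enum { ECN_NOT_ECT = 0, ECN_ECT_1 = 1, ECN_ECT_0 = 2, ECN_CE = 3, ECN_MASK = 3 };

/* Same logic as __INET_ECN_decapsulate(): 0 = ok, 1 = log, 2 = drop. */
static int ecn_decapsulate(unsigned char outer, unsigned char inner, bool *set_ce)
{
	if ((inner & ECN_MASK) == ECN_NOT_ECT) {
		switch (outer & ECN_MASK) {
		case ECN_NOT_ECT:
			return 0;
		case ECN_ECT_0:
		case ECN_ECT_1:
			return 1;	/* broken tunnel, worth logging */
		case ECN_CE:
			return 2;	/* CE mark the receiver cannot see: drop */
		}
	}
	*set_ce = (outer & ECN_MASK) == ECN_CE;
	return 0;
}

int main(void)
{
	static const char *name[] = { "Not-ECT", "ECT(1)", "ECT(0)", "CE" };

	for (int inner = 0; inner < 4; inner++) {
		for (int outer = 0; outer < 4; outer++) {
			bool set_ce = false;
			int rc = ecn_decapsulate(outer, inner, &set_ce);

			printf("inner %-7s outer %-7s -> rc=%d%s\n",
			       name[inner], name[outer], rc,
			       set_ce ? " (mark CE)" : "");
		}
	}
	return 0;
}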
// SPDX-License-Identifier: GPL-2.0-only #include <linux/fs.h> #include <linux/xattr.h> #include "overlayfs.h" static bool ovl_is_escaped_xattr(struct super_block *sb, const char *name) { struct ovl_fs *ofs = sb->s_fs_info; if (ofs->config.userxattr) return strncmp(name, OVL_XATTR_ESCAPE_USER_PREFIX, OVL_XATTR_ESCAPE_USER_PREFIX_LEN) == 0; else return strncmp(name, OVL_XATTR_ESCAPE_TRUSTED_PREFIX, OVL_XATTR_ESCAPE_TRUSTED_PREFIX_LEN - 1) == 0; } static bool ovl_is_own_xattr(struct super_block *sb, const char *name) { struct ovl_fs *ofs = OVL_FS(sb); if (ofs->config.userxattr) return strncmp(name, OVL_XATTR_USER_PREFIX, OVL_XATTR_USER_PREFIX_LEN) == 0; else return strncmp(name, OVL_XATTR_TRUSTED_PREFIX, OVL_XATTR_TRUSTED_PREFIX_LEN) == 0; } bool ovl_is_private_xattr(struct super_block *sb, const char *name) { return ovl_is_own_xattr(sb, name) && !ovl_is_escaped_xattr(sb, name); } static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { int err; struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *upperdentry = ovl_i_dentry_upper(inode); struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); struct path realpath; const struct cred *old_cred; if (!value && !upperdentry) { ovl_path_lower(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0); revert_creds(old_cred); if (err < 0) goto out; } if (!upperdentry) { err = ovl_copy_up(dentry); if (err) goto out; realdentry = ovl_dentry_upper(dentry); } err = ovl_want_write(dentry); if (err) goto out; old_cred = ovl_override_creds(dentry->d_sb); if (value) { err = ovl_do_setxattr(ofs, realdentry, name, value, size, flags); } else { WARN_ON(flags != XATTR_REPLACE); err = ovl_do_removexattr(ofs, realdentry, name); } revert_creds(old_cred); ovl_drop_write(dentry); /* copy c/mtime */ ovl_copyattr(inode); out: return err; } static int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { ssize_t res; const struct cred *old_cred; struct path realpath; ovl_i_path_real(inode, &realpath); old_cred = ovl_override_creds(dentry->d_sb); res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size); revert_creds(old_cred); return res; } static bool ovl_can_list(struct super_block *sb, const char *s) { /* Never list private (.overlay) */ if (ovl_is_private_xattr(sb, s)) return false; /* List all
non-trusted xattrs */ if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0) return true; /* list other trusted for superuser only */ return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); } ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) { struct dentry *realdentry = ovl_dentry_real(dentry); struct ovl_fs *ofs = OVL_FS(dentry->d_sb); ssize_t res; size_t len; char *s; const struct cred *old_cred; size_t prefix_len, name_len; old_cred = ovl_override_creds(dentry->d_sb); res = vfs_listxattr(realdentry, list, size); revert_creds(old_cred); if (res <= 0 || size == 0) return res; prefix_len = ofs->config.userxattr ? OVL_XATTR_USER_PREFIX_LEN : OVL_XATTR_TRUSTED_PREFIX_LEN; /* filter out private xattrs */ for (s = list, len = res; len;) { size_t slen = strnlen(s, len) + 1; /* underlying fs providing us with a broken xattr list? */ if (WARN_ON(slen > len)) return -EIO; len -= slen; if (!ovl_can_list(dentry->d_sb, s)) { res -= slen; memmove(s, s + slen, len); } else if (ovl_is_escaped_xattr(dentry->d_sb, s)) { res -= OVL_XATTR_ESCAPE_PREFIX_LEN; name_len = slen - prefix_len - OVL_XATTR_ESCAPE_PREFIX_LEN; s += prefix_len; memmove(s, s + OVL_XATTR_ESCAPE_PREFIX_LEN, name_len + len); s += name_len; } else { s += slen; } } return res; } static char *ovl_xattr_escape_name(const char *prefix, const char *name) { size_t prefix_len = strlen(prefix); size_t name_len = strlen(name); size_t escaped_len; char *escaped, *s; escaped_len = prefix_len + OVL_XATTR_ESCAPE_PREFIX_LEN + name_len; if (escaped_len > XATTR_NAME_MAX) return ERR_PTR(-EOPNOTSUPP); escaped = kmalloc(escaped_len + 1, GFP_KERNEL); if (escaped == NULL) return ERR_PTR(-ENOMEM); s = escaped; memcpy(s, prefix, prefix_len); s += prefix_len; memcpy(s, OVL_XATTR_ESCAPE_PREFIX, OVL_XATTR_ESCAPE_PREFIX_LEN); s += OVL_XATTR_ESCAPE_PREFIX_LEN; memcpy(s, name, name_len + 1); return escaped; } static int ovl_own_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { char *escaped; int r; escaped = ovl_xattr_escape_name(handler->prefix, name); if (IS_ERR(escaped)) return PTR_ERR(escaped); r = ovl_xattr_get(dentry, inode, escaped, buffer, size); kfree(escaped); return r; } static int ovl_own_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { char *escaped; int r; escaped = ovl_xattr_escape_name(handler->prefix, name); if (IS_ERR(escaped)) return PTR_ERR(escaped); r = ovl_xattr_set(dentry, inode, escaped, value, size, flags); kfree(escaped); return r; } static int ovl_other_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { return ovl_xattr_get(dentry, inode, name, buffer, size); } static int ovl_other_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { return ovl_xattr_set(dentry, inode, name, value, size, flags); } static const struct xattr_handler ovl_own_trusted_xattr_handler = { .prefix = OVL_XATTR_TRUSTED_PREFIX, .get = ovl_own_xattr_get, .set = ovl_own_xattr_set, }; static const struct xattr_handler ovl_own_user_xattr_handler = { .prefix = OVL_XATTR_USER_PREFIX, .get = ovl_own_xattr_get, .set = ovl_own_xattr_set, }; static const struct xattr_handler ovl_other_xattr_handler = { .prefix = "", /*
catch all */ .get = ovl_other_xattr_get, .set = ovl_other_xattr_set, }; static const struct xattr_handler * const ovl_trusted_xattr_handlers[] = { &ovl_own_trusted_xattr_handler, &ovl_other_xattr_handler, NULL }; static const struct xattr_handler * const ovl_user_xattr_handlers[] = { &ovl_own_user_xattr_handler, &ovl_other_xattr_handler, NULL }; const struct xattr_handler * const *ovl_xattr_handlers(struct ovl_fs *ofs) { return ofs->config.userxattr ? ovl_user_xattr_handlers : ovl_trusted_xattr_handlers; }
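/*
 * Standalone user-space sketch (not part of overlayfs) of the name escaping
 * done by ovl_xattr_escape_name() above.  The real prefix strings live in
 * overlayfs.h; the values used here ("trusted.overlay." for the handler
 * prefix and "overlay." for the escape marker) are assumptions chosen only
 * for illustration.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEMO_OWN_PREFIX    "trusted.overlay."	/* assumed handler prefix */
#define DEMO_ESCAPE_PREFIX "overlay."		/* assumed escape marker */

/* Mirrors ovl_xattr_escape_name(): prefix + escape marker + original suffix. */
static char *demo_escape_name(const char *prefix, const char *name)
{
	size_t plen = strlen(prefix);
	size_t elen = strlen(DEMO_ESCAPE_PREFIX);
	size_t nlen = strlen(name);
	char *escaped = malloc(plen + elen + nlen + 1);

	if (!escaped)
		return NULL;
	memcpy(escaped, prefix, plen);
	memcpy(escaped + plen, DEMO_ESCAPE_PREFIX, elen);
	memcpy(escaped + plen + elen, name, nlen + 1);
	return escaped;
}

int main(void)
{
	/* An xattr set through the overlay under its own prefix is stored
	 * escaped on the upper layer, so it can never collide with the
	 * xattrs overlayfs uses for its own bookkeeping. */
	char *escaped = demo_escape_name(DEMO_OWN_PREFIX, "opaque");

	if (escaped) {
		printf("%sopaque -> %s\n", DEMO_OWN_PREFIX, escaped);
		free(escaped);
	}
	return 0;
}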
/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* Copyright (C) 2019 Netronome Systems, Inc. */ #ifndef __NET_TC_MPLS_H #define __NET_TC_MPLS_H #include <linux/tc_act/tc_mpls.h> #include <net/act_api.h> struct tcf_mpls_params { int tcfm_action; u32 tcfm_label; u8 tcfm_tc; u8 tcfm_ttl; u8 tcfm_bos; __be16 tcfm_proto; struct rcu_head rcu; }; #define ACT_MPLS_TC_NOT_SET 0xff #define ACT_MPLS_BOS_NOT_SET 0xff #define ACT_MPLS_LABEL_NOT_SET 0xffffffff struct tcf_mpls { struct tc_action common; struct tcf_mpls_params __rcu *mpls_p; }; #define to_mpls(a) ((struct tcf_mpls *)a) static inline bool is_tcf_mpls(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT if (a->ops && a->ops->id == TCA_ID_MPLS) return true; #endif return false; } static inline u32 tcf_mpls_action(const struct tc_action *a) { u32 tcfm_action; rcu_read_lock(); tcfm_action = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_action; rcu_read_unlock(); return tcfm_action; } static inline __be16 tcf_mpls_proto(const struct tc_action *a) { __be16 tcfm_proto; rcu_read_lock(); tcfm_proto = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_proto; rcu_read_unlock(); return tcfm_proto; } static inline u32 tcf_mpls_label(const struct tc_action *a) { u32 tcfm_label; rcu_read_lock(); tcfm_label = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_label; rcu_read_unlock(); return tcfm_label; } static inline u8 tcf_mpls_tc(const struct tc_action *a) { u8 tcfm_tc; rcu_read_lock(); tcfm_tc = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_tc; rcu_read_unlock(); return tcfm_tc; } static inline u8 tcf_mpls_bos(const struct tc_action *a) { u8 tcfm_bos; rcu_read_lock(); tcfm_bos = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_bos; rcu_read_unlock(); return tcfm_bos; } static inline u8 tcf_mpls_ttl(const struct tc_action *a) { u8 tcfm_ttl; rcu_read_lock(); tcfm_ttl = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_ttl; rcu_read_unlock(); return tcfm_ttl; } #endif /* __NET_TC_MPLS_H */
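/*
 * Hedged usage sketch (not from the kernel tree): how an offload or dump
 * path might interrogate an MPLS action with the accessors above.  Every
 * symbol except the hypothetical example_inspect_mpls() wrapper comes from
 * this header or from <linux/tc_act/tc_mpls.h>.  Note that each accessor
 * takes rcu_read_lock() internally, so the caller needs no extra locking
 * just to read the parameters.
 */
static void example_inspect_mpls(const struct tc_action *act)
{
	if (!is_tcf_mpls(act))
		return;

	switch (tcf_mpls_action(act)) {
	case TCA_MPLS_ACT_PUSH:
		pr_info("push label %u, ttl %u, proto %#06x\n",
			tcf_mpls_label(act), tcf_mpls_ttl(act),
			ntohs(tcf_mpls_proto(act)));
		break;
	case TCA_MPLS_ACT_POP:
		pr_info("pop, next proto %#06x\n", ntohs(tcf_mpls_proto(act)));
		break;
	default:
		break;
	}
}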
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions of the Internet Protocol. * * Version: @(#)in.h 1.0.1 04/21/93 * * Authors: Original taken from the GNU Project <netinet/in.h> file. * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_IN_H #define _LINUX_IN_H #include <linux/errno.h> #include <uapi/linux/in.h> static inline int proto_ports_offset(int proto) { switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_DCCP: case IPPROTO_ESP: /* SPI */ case IPPROTO_SCTP: case IPPROTO_UDPLITE: return 0; case IPPROTO_AH: /* SPI */ return 4; default: return -EINVAL; } } static inline bool ipv4_is_loopback(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x7f000000); } static inline bool ipv4_is_multicast(__be32 addr) { return (addr & htonl(0xf0000000)) == htonl(0xe0000000); } static inline bool ipv4_is_local_multicast(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xe0000000); } static inline bool ipv4_is_lbcast(__be32 addr) { /* limited broadcast */ return addr == htonl(INADDR_BROADCAST); } static inline bool ipv4_is_all_snoopers(__be32 addr) { return addr == htonl(INADDR_ALLSNOOPERS_GROUP); } static inline bool ipv4_is_zeronet(__be32 addr) { return (addr == 0); } /* Special-Use IPv4 Addresses (RFC3330) */ static inline bool ipv4_is_private_10(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x0a000000); } static inline bool ipv4_is_private_172(__be32 addr) { return (addr & htonl(0xfff00000)) == htonl(0xac100000); } static inline bool ipv4_is_private_192(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xc0a80000); } static inline bool ipv4_is_linklocal_169(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xa9fe0000); } static inline bool ipv4_is_anycast_6to4(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0586300); } static inline bool ipv4_is_test_192(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0000200); } static inline bool ipv4_is_test_198(__be32 addr) { return (addr & htonl(0xfffe0000)) == htonl(0xc6120000); } #endif /* _LINUX_IN_H */
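/*
 * Standalone user-space sketch (not part of the header above) showing what
 * the mask-and-compare helpers in <linux/in.h> actually test.  The masks and
 * network-byte-order constants are copied from the inlines above; only the
 * demo wrapper names are new.
 */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>
#include <arpa/inet.h>

static bool is_loopback(uint32_t addr)    { return (addr & htonl(0xff000000)) == htonl(0x7f000000); }
static bool is_private_192(uint32_t addr) { return (addr & htonl(0xffff0000)) == htonl(0xc0a80000); }
static bool is_multicast(uint32_t addr)   { return (addr & htonl(0xf0000000)) == htonl(0xe0000000); }

int main(void)
{
	const char *samples[] = { "127.0.0.1", "192.168.1.10", "224.0.0.251", "8.8.8.8" };

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		uint32_t a = inet_addr(samples[i]);	/* already network byte order */

		printf("%-13s loopback=%d private_192=%d multicast=%d\n",
		       samples[i], is_loopback(a), is_private_192(a), is_multicast(a));
	}
	return 0;
}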
// SPDX-License-Identifier: GPL-2.0 /* drivers/net/wireless/virt_wifi.c * * A fake implementation of cfg80211_ops that can be tacked on to an ethernet * net_device to make it appear as a wireless connection. * * Copyright (C) 2018 Google, Inc.
* * Author: schuffelen@google.com */ #include <net/cfg80211.h> #include <net/rtnetlink.h> #include <linux/etherdevice.h> #include <linux/math64.h> #include <linux/module.h> static struct wiphy *common_wiphy; struct virt_wifi_wiphy_priv { struct delayed_work scan_result; struct cfg80211_scan_request *scan_request; bool being_deleted; }; static struct ieee80211_channel channel_2ghz = { .band = NL80211_BAND_2GHZ, .center_freq = 2432, .hw_value = 2432, .max_power = 20, }; static struct ieee80211_rate bitrates_2ghz[] = { { .bitrate = 10 }, { .bitrate = 20 }, { .bitrate = 55 }, { .bitrate = 110 }, { .bitrate = 60 }, { .bitrate = 120 }, { .bitrate = 240 }, }; static struct ieee80211_supported_band band_2ghz = { .channels = &channel_2ghz, .bitrates = bitrates_2ghz, .band = NL80211_BAND_2GHZ, .n_channels = 1, .n_bitrates = ARRAY_SIZE(bitrates_2ghz), .ht_cap = { .ht_supported = true, .cap = IEEE80211_HT_CAP_SUP_WIDTH_20_40 | IEEE80211_HT_CAP_GRN_FLD | IEEE80211_HT_CAP_SGI_20 | IEEE80211_HT_CAP_SGI_40 | IEEE80211_HT_CAP_DSSSCCK40, .ampdu_factor = 0x3, .ampdu_density = 0x6, .mcs = { .rx_mask = {0xff, 0xff}, .tx_params = IEEE80211_HT_MCS_TX_DEFINED, }, }, }; static struct ieee80211_channel channel_5ghz = { .band = NL80211_BAND_5GHZ, .center_freq = 5240, .hw_value = 5240, .max_power = 20, }; static struct ieee80211_rate bitrates_5ghz[] = { { .bitrate = 60 }, { .bitrate = 120 }, { .bitrate = 240 }, }; #define RX_MCS_MAP (IEEE80211_VHT_MCS_SUPPORT_0_9 << 0 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 2 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 4 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 6 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 8 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 10 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 12 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 14) #define TX_MCS_MAP (IEEE80211_VHT_MCS_SUPPORT_0_9 << 0 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 2 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 4 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 6 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 8 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 10 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 12 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 14) static struct ieee80211_supported_band band_5ghz = { .channels = &channel_5ghz, .bitrates = bitrates_5ghz, .band = NL80211_BAND_5GHZ, .n_channels = 1, .n_bitrates = ARRAY_SIZE(bitrates_5ghz), .ht_cap = { .ht_supported = true, .cap = IEEE80211_HT_CAP_SUP_WIDTH_20_40 | IEEE80211_HT_CAP_GRN_FLD | IEEE80211_HT_CAP_SGI_20 | IEEE80211_HT_CAP_SGI_40 | IEEE80211_HT_CAP_DSSSCCK40, .ampdu_factor = 0x3, .ampdu_density = 0x6, .mcs = { .rx_mask = {0xff, 0xff}, .tx_params = IEEE80211_HT_MCS_TX_DEFINED, }, }, .vht_cap = { .vht_supported = true, .cap = IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 | IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ | IEEE80211_VHT_CAP_RXLDPC | IEEE80211_VHT_CAP_SHORT_GI_80 | IEEE80211_VHT_CAP_SHORT_GI_160 | IEEE80211_VHT_CAP_TXSTBC | IEEE80211_VHT_CAP_RXSTBC_1 | IEEE80211_VHT_CAP_RXSTBC_2 | IEEE80211_VHT_CAP_RXSTBC_3 | IEEE80211_VHT_CAP_RXSTBC_4 | IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK, .vht_mcs = { .rx_mcs_map = cpu_to_le16(RX_MCS_MAP), .tx_mcs_map = cpu_to_le16(TX_MCS_MAP), } }, }; /* Assigned at module init. Guaranteed locally-administered and unicast. 
*/ static u8 fake_router_bssid[ETH_ALEN] __ro_after_init = {}; static void virt_wifi_inform_bss(struct wiphy *wiphy) { u64 tsf = div_u64(ktime_get_boottime_ns(), 1000); struct cfg80211_bss *informed_bss; static const struct { u8 tag; u8 len; u8 ssid[8]; } __packed ssid = { .tag = WLAN_EID_SSID, .len = 8, .ssid = "VirtWifi", }; informed_bss = cfg80211_inform_bss(wiphy, &channel_5ghz, CFG80211_BSS_FTYPE_PRESP, fake_router_bssid, tsf, WLAN_CAPABILITY_ESS, 0, (void *)&ssid, sizeof(ssid), DBM_TO_MBM(-50), GFP_KERNEL); cfg80211_put_bss(wiphy, informed_bss); } /* Called with the rtnl lock held. */ static int virt_wifi_scan(struct wiphy *wiphy, struct cfg80211_scan_request *request) { struct virt_wifi_wiphy_priv *priv = wiphy_priv(wiphy); wiphy_debug(wiphy, "scan\n"); if (priv->scan_request || priv->being_deleted) return -EBUSY; priv->scan_request = request; schedule_delayed_work(&priv->scan_result, HZ * 2); return 0; } /* Acquires and releases the rdev BSS lock. */ static void virt_wifi_scan_result(struct work_struct *work) { struct virt_wifi_wiphy_priv *priv = container_of(work, struct virt_wifi_wiphy_priv, scan_result.work); struct wiphy *wiphy = priv_to_wiphy(priv); struct cfg80211_scan_info scan_info = { .aborted = false }; virt_wifi_inform_bss(wiphy); /* Schedules work which acquires and releases the rtnl lock. */ cfg80211_scan_done(priv->scan_request, &scan_info); priv->scan_request = NULL; } /* May acquire and release the rdev BSS lock. */ static void virt_wifi_cancel_scan(struct wiphy *wiphy) { struct virt_wifi_wiphy_priv *priv = wiphy_priv(wiphy); cancel_delayed_work_sync(&priv->scan_result); /* Clean up dangling callbacks if necessary. */ if (priv->scan_request) { struct cfg80211_scan_info scan_info = { .aborted = true }; /* Schedules work which acquires and releases the rtnl lock. */ cfg80211_scan_done(priv->scan_request, &scan_info); priv->scan_request = NULL; } } struct virt_wifi_netdev_priv { struct delayed_work connect; struct net_device *lowerdev; struct net_device *upperdev; u32 tx_packets; u32 tx_failed; u8 connect_requested_bss[ETH_ALEN]; bool is_up; bool is_connected; bool being_deleted; }; /* Called with the rtnl lock held. */ static int virt_wifi_connect(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_connect_params *sme) { struct virt_wifi_netdev_priv *priv = netdev_priv(netdev); bool could_schedule; if (priv->being_deleted || !priv->is_up) return -EBUSY; could_schedule = schedule_delayed_work(&priv->connect, HZ * 2); if (!could_schedule) return -EBUSY; if (sme->bssid) { ether_addr_copy(priv->connect_requested_bss, sme->bssid); } else { virt_wifi_inform_bss(wiphy); eth_zero_addr(priv->connect_requested_bss); } wiphy_debug(wiphy, "connect\n"); return 0; } /* Acquires and releases the rdev event lock. */ static void virt_wifi_connect_complete(struct work_struct *work) { struct virt_wifi_netdev_priv *priv = container_of(work, struct virt_wifi_netdev_priv, connect.work); u8 *requested_bss = priv->connect_requested_bss; bool right_addr = ether_addr_equal(requested_bss, fake_router_bssid); u16 status = WLAN_STATUS_SUCCESS; if (is_zero_ether_addr(requested_bss)) requested_bss = NULL; if (!priv->is_up || (requested_bss && !right_addr)) status = WLAN_STATUS_UNSPECIFIED_FAILURE; else priv->is_connected = true; /* Schedules an event that acquires the rtnl lock. */ cfg80211_connect_result(priv->upperdev, requested_bss, NULL, 0, NULL, 0, status, GFP_KERNEL); netif_carrier_on(priv->upperdev); } /* May acquire and release the rdev event lock. 
*/ static void virt_wifi_cancel_connect(struct net_device *netdev) { struct virt_wifi_netdev_priv *priv = netdev_priv(netdev); /* If there is work pending, clean up dangling callbacks. */ if (cancel_delayed_work_sync(&priv->connect)) { /* Schedules an event that acquires the rtnl lock. */ cfg80211_connect_result(priv->upperdev, priv->connect_requested_bss, NULL, 0, NULL, 0, WLAN_STATUS_UNSPECIFIED_FAILURE, GFP_KERNEL); } } /* Called with the rtnl lock held. Acquires the rdev event lock. */ static int virt_wifi_disconnect(struct wiphy *wiphy, struct net_device *netdev, u16 reason_code) { struct virt_wifi_netdev_priv *priv = netdev_priv(netdev); if (priv->being_deleted) return -EBUSY; wiphy_debug(wiphy, "disconnect\n"); virt_wifi_cancel_connect(netdev); cfg80211_disconnected(netdev, reason_code, NULL, 0, true, GFP_KERNEL); priv->is_connected = false; netif_carrier_off(netdev); return 0; } /* Called with the rtnl lock held. */ static int virt_wifi_get_station(struct wiphy *wiphy, struct net_device *dev, const u8 *mac, struct station_info *sinfo) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); wiphy_debug(wiphy, "get_station\n"); if (!priv->is_connected || !ether_addr_equal(mac, fake_router_bssid)) return -ENOENT; sinfo->filled = BIT_ULL(NL80211_STA_INFO_TX_PACKETS) | BIT_ULL(NL80211_STA_INFO_TX_FAILED) | BIT_ULL(NL80211_STA_INFO_SIGNAL) | BIT_ULL(NL80211_STA_INFO_TX_BITRATE); sinfo->tx_packets = priv->tx_packets; sinfo->tx_failed = priv->tx_failed; /* For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_ */ sinfo->signal = -50; sinfo->txrate = (struct rate_info) { .legacy = 10, /* units are 100kbit/s */ }; return 0; } /* Called with the rtnl lock held. */ static int virt_wifi_dump_station(struct wiphy *wiphy, struct net_device *dev, int idx, u8 *mac, struct station_info *sinfo) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); wiphy_debug(wiphy, "dump_station\n"); if (idx != 0 || !priv->is_connected) return -ENOENT; ether_addr_copy(mac, fake_router_bssid); return virt_wifi_get_station(wiphy, dev, fake_router_bssid, sinfo); } static const struct cfg80211_ops virt_wifi_cfg80211_ops = { .scan = virt_wifi_scan, .connect = virt_wifi_connect, .disconnect = virt_wifi_disconnect, .get_station = virt_wifi_get_station, .dump_station = virt_wifi_dump_station, }; /* Acquires and releases the rtnl lock. */ static struct wiphy *virt_wifi_make_wiphy(void) { struct wiphy *wiphy; struct virt_wifi_wiphy_priv *priv; int err; wiphy = wiphy_new(&virt_wifi_cfg80211_ops, sizeof(*priv)); if (!wiphy) return NULL; wiphy->max_scan_ssids = 4; wiphy->max_scan_ie_len = 1000; wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM; wiphy->bands[NL80211_BAND_2GHZ] = &band_2ghz; wiphy->bands[NL80211_BAND_5GHZ] = &band_5ghz; wiphy->bands[NL80211_BAND_60GHZ] = NULL; wiphy->interface_modes = BIT(NL80211_IFTYPE_STATION); priv = wiphy_priv(wiphy); priv->being_deleted = false; priv->scan_request = NULL; INIT_DELAYED_WORK(&priv->scan_result, virt_wifi_scan_result); err = wiphy_register(wiphy); if (err < 0) { wiphy_free(wiphy); return NULL; } return wiphy; } /* Acquires and releases the rtnl lock. */ static void virt_wifi_destroy_wiphy(struct wiphy *wiphy) { struct virt_wifi_wiphy_priv *priv; WARN(!wiphy, "%s called with null wiphy", __func__); if (!wiphy) return; priv = wiphy_priv(wiphy); priv->being_deleted = true; virt_wifi_cancel_scan(wiphy); if (wiphy->registered) wiphy_unregister(wiphy); wiphy_free(wiphy); } /* Enters and exits a RCU-bh critical section. 
*/ static netdev_tx_t virt_wifi_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); priv->tx_packets++; if (!priv->is_connected) { priv->tx_failed++; return NET_XMIT_DROP; } skb->dev = priv->lowerdev; return dev_queue_xmit(skb); } /* Called with rtnl lock held. */ static int virt_wifi_net_device_open(struct net_device *dev) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); priv->is_up = true; return 0; } /* Called with rtnl lock held. */ static int virt_wifi_net_device_stop(struct net_device *dev) { struct virt_wifi_netdev_priv *n_priv = netdev_priv(dev); n_priv->is_up = false; if (!dev->ieee80211_ptr) return 0; virt_wifi_cancel_scan(dev->ieee80211_ptr->wiphy); virt_wifi_cancel_connect(dev); netif_carrier_off(dev); return 0; } static int virt_wifi_net_device_get_iflink(const struct net_device *dev) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); return priv->lowerdev->ifindex; } static const struct net_device_ops virt_wifi_ops = { .ndo_start_xmit = virt_wifi_start_xmit, .ndo_open = virt_wifi_net_device_open, .ndo_stop = virt_wifi_net_device_stop, .ndo_get_iflink = virt_wifi_net_device_get_iflink, }; /* Invoked as part of rtnl lock release. */ static void virt_wifi_net_device_destructor(struct net_device *dev) { /* Delayed past dellink to allow nl80211 to react to the device being * deleted. */ kfree(dev->ieee80211_ptr); dev->ieee80211_ptr = NULL; } /* No lock interaction. */ static void virt_wifi_setup(struct net_device *dev) { ether_setup(dev); dev->netdev_ops = &virt_wifi_ops; dev->needs_free_netdev = true; } /* Called in a RCU read critical section from netif_receive_skb */ static rx_handler_result_t virt_wifi_rx_handler(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct virt_wifi_netdev_priv *priv = rcu_dereference(skb->dev->rx_handler_data); if (!priv->is_connected) return RX_HANDLER_PASS; /* GFP_ATOMIC because this is a packet interrupt handler. */ skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { dev_err(&priv->upperdev->dev, "can't skb_share_check\n"); return RX_HANDLER_CONSUMED; } *pskb = skb; skb->dev = priv->upperdev; skb->pkt_type = PACKET_HOST; return RX_HANDLER_ANOTHER; } /* Called with rtnl lock held. 
*/ static int virt_wifi_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); int err; if (!tb[IFLA_LINK]) return -EINVAL; netif_carrier_off(dev); priv->upperdev = dev; priv->lowerdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); if (!priv->lowerdev) return -ENODEV; if (!tb[IFLA_MTU]) dev->mtu = priv->lowerdev->mtu; else if (dev->mtu > priv->lowerdev->mtu) return -EINVAL; err = netdev_rx_handler_register(priv->lowerdev, virt_wifi_rx_handler, priv); if (err) { dev_err(&priv->lowerdev->dev, "can't netdev_rx_handler_register: %d\n", err); return err; } eth_hw_addr_inherit(dev, priv->lowerdev); netif_stacked_transfer_operstate(priv->lowerdev, dev); SET_NETDEV_DEV(dev, &priv->lowerdev->dev); dev->ieee80211_ptr = kzalloc(sizeof(*dev->ieee80211_ptr), GFP_KERNEL); if (!dev->ieee80211_ptr) { err = -ENOMEM; goto remove_handler; } dev->ieee80211_ptr->iftype = NL80211_IFTYPE_STATION; dev->ieee80211_ptr->wiphy = common_wiphy; err = register_netdevice(dev); if (err) { dev_err(&priv->lowerdev->dev, "can't register_netdevice: %d\n", err); goto free_wireless_dev; } err = netdev_upper_dev_link(priv->lowerdev, dev, extack); if (err) { dev_err(&priv->lowerdev->dev, "can't netdev_upper_dev_link: %d\n", err); goto unregister_netdev; } dev->priv_destructor = virt_wifi_net_device_destructor; priv->being_deleted = false; priv->is_connected = false; priv->is_up = false; INIT_DELAYED_WORK(&priv->connect, virt_wifi_connect_complete); __module_get(THIS_MODULE); return 0; unregister_netdev: unregister_netdevice(dev); free_wireless_dev: kfree(dev->ieee80211_ptr); dev->ieee80211_ptr = NULL; remove_handler: netdev_rx_handler_unregister(priv->lowerdev); return err; } /* Called with rtnl lock held. */ static void virt_wifi_dellink(struct net_device *dev, struct list_head *head) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); if (dev->ieee80211_ptr) virt_wifi_cancel_scan(dev->ieee80211_ptr->wiphy); priv->being_deleted = true; virt_wifi_cancel_connect(dev); netif_carrier_off(dev); netdev_rx_handler_unregister(priv->lowerdev); netdev_upper_dev_unlink(priv->lowerdev, dev); unregister_netdevice_queue(dev, head); module_put(THIS_MODULE); /* Deleting the wiphy is handled in the module destructor. */ } static struct rtnl_link_ops virt_wifi_link_ops = { .kind = "virt_wifi", .setup = virt_wifi_setup, .newlink = virt_wifi_newlink, .dellink = virt_wifi_dellink, .priv_size = sizeof(struct virt_wifi_netdev_priv), }; static bool netif_is_virt_wifi_dev(const struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == virt_wifi_rx_handler; } static int virt_wifi_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *lower_dev = netdev_notifier_info_to_dev(ptr); struct virt_wifi_netdev_priv *priv; struct net_device *upper_dev; LIST_HEAD(list_kill); if (!netif_is_virt_wifi_dev(lower_dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UNREGISTER: priv = rtnl_dereference(lower_dev->rx_handler_data); if (!priv) return NOTIFY_DONE; upper_dev = priv->upperdev; upper_dev->rtnl_link_ops->dellink(upper_dev, &list_kill); unregister_netdevice_many(&list_kill); break; } return NOTIFY_DONE; } static struct notifier_block virt_wifi_notifier = { .notifier_call = virt_wifi_event, }; /* Acquires and releases the rtnl lock. */ static int __init virt_wifi_init_module(void) { int err; /* Guaranteed to be locally-administered and not multicast. 
*/ eth_random_addr(fake_router_bssid); err = register_netdevice_notifier(&virt_wifi_notifier); if (err) return err; err = -ENOMEM; common_wiphy = virt_wifi_make_wiphy(); if (!common_wiphy) goto notifier; err = rtnl_link_register(&virt_wifi_link_ops); if (err) goto destroy_wiphy; return 0; destroy_wiphy: virt_wifi_destroy_wiphy(common_wiphy); notifier: unregister_netdevice_notifier(&virt_wifi_notifier); return err; } /* Acquires and releases the rtnl lock. */ static void __exit virt_wifi_cleanup_module(void) { /* Will delete any devices that depend on the wiphy. */ rtnl_link_unregister(&virt_wifi_link_ops); virt_wifi_destroy_wiphy(common_wiphy); unregister_netdevice_notifier(&virt_wifi_notifier); } module_init(virt_wifi_init_module); module_exit(virt_wifi_cleanup_module); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Cody Schuffelen <schuffelen@google.com>"); MODULE_DESCRIPTION("Driver for a wireless wrapper of ethernet devices"); MODULE_ALIAS_RTNL_LINK("virt_wifi");
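/*
 * Standalone user-space sketch (not part of the driver above) of the property
 * the "locally-administered and unicast" comments rely on: eth_random_addr()
 * clears the multicast bit (0x01) and sets the locally-administered bit
 * (0x02) in the first octet of the random MAC, so the fake BSSID can never
 * collide with a vendor-assigned address.  The helper below only mimics that
 * bit manipulation; it is not the kernel implementation.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static void demo_random_ether_addr(unsigned char addr[6])
{
	for (int i = 0; i < 6; i++)
		addr[i] = rand() & 0xff;
	addr[0] &= 0xfe;	/* clear the multicast bit */
	addr[0] |= 0x02;	/* set the locally-administered bit */
}

int main(void)
{
	unsigned char bssid[6];

	srand((unsigned int)time(NULL));
	demo_random_ether_addr(bssid);
	printf("fake BSSID %02x:%02x:%02x:%02x:%02x:%02x (unicast=%d, local=%d)\n",
	       bssid[0], bssid[1], bssid[2], bssid[3], bssid[4], bssid[5],
	       !(bssid[0] & 0x01), !!(bssid[0] & 0x02));
	return 0;
}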
// SPDX-License-Identifier: GPL-2.0+ /* * Generic USB driver for report based interrupt in/out devices * like LD Didactic's USB devices. LD Didactic's USB devices are * HID devices which do not use HID report definitions (they use * raw interrupt in and out reports only for communication).
* * This driver uses a ring buffer for time critical reading of * interrupt in reports and provides read and write methods for * raw interrupt reports (similar to the Windows HID driver). * Devices based on the book USB COMPLETE by Jan Axelson may need * such a compatibility to the Windows HID driver. * * Copyright (C) 2005 Michael Hund <mhund@ld-didactic.de> * * Derived from Lego USB Tower driver * Copyright (C) 2003 David Glance <advidgsf@sourceforge.net> * 2001-2004 Juergen Stuber <starblue@users.sourceforge.net> */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/uaccess.h> #include <linux/input.h> #include <linux/usb.h> #include <linux/poll.h> /* Define these values to match your devices */ #define USB_VENDOR_ID_LD 0x0f11 /* USB Vendor ID of LD Didactic GmbH */ #define USB_DEVICE_ID_LD_CASSY 0x1000 /* USB Product ID of CASSY-S modules with 8 bytes endpoint size */ #define USB_DEVICE_ID_LD_CASSY2 0x1001 /* USB Product ID of CASSY-S modules with 64 bytes endpoint size */ #define USB_DEVICE_ID_LD_POCKETCASSY 0x1010 /* USB Product ID of Pocket-CASSY */ #define USB_DEVICE_ID_LD_POCKETCASSY2 0x1011 /* USB Product ID of Pocket-CASSY 2 (reserved) */ #define USB_DEVICE_ID_LD_MOBILECASSY 0x1020 /* USB Product ID of Mobile-CASSY */ #define USB_DEVICE_ID_LD_MOBILECASSY2 0x1021 /* USB Product ID of Mobile-CASSY 2 (reserved) */ #define USB_DEVICE_ID_LD_MICROCASSYVOLTAGE 0x1031 /* USB Product ID of Micro-CASSY Voltage */ #define USB_DEVICE_ID_LD_MICROCASSYCURRENT 0x1032 /* USB Product ID of Micro-CASSY Current */ #define USB_DEVICE_ID_LD_MICROCASSYTIME 0x1033 /* USB Product ID of Micro-CASSY Time (reserved) */ #define USB_DEVICE_ID_LD_MICROCASSYTEMPERATURE 0x1035 /* USB Product ID of Micro-CASSY Temperature */ #define USB_DEVICE_ID_LD_MICROCASSYPH 0x1038 /* USB Product ID of Micro-CASSY pH */ #define USB_DEVICE_ID_LD_POWERANALYSERCASSY 0x1040 /* USB Product ID of Power Analyser CASSY */ #define USB_DEVICE_ID_LD_CONVERTERCONTROLLERCASSY 0x1042 /* USB Product ID of Converter Controller CASSY */ #define USB_DEVICE_ID_LD_MACHINETESTCASSY 0x1043 /* USB Product ID of Machine Test CASSY */ #define USB_DEVICE_ID_LD_JWM 0x1080 /* USB Product ID of Joule and Wattmeter */ #define USB_DEVICE_ID_LD_DMMP 0x1081 /* USB Product ID of Digital Multimeter P (reserved) */ #define USB_DEVICE_ID_LD_UMIP 0x1090 /* USB Product ID of UMI P */ #define USB_DEVICE_ID_LD_UMIC 0x10A0 /* USB Product ID of UMI C */ #define USB_DEVICE_ID_LD_UMIB 0x10B0 /* USB Product ID of UMI B */ #define USB_DEVICE_ID_LD_XRAY 0x1100 /* USB Product ID of X-Ray Apparatus 55481 */ #define USB_DEVICE_ID_LD_XRAY2 0x1101 /* USB Product ID of X-Ray Apparatus 554800 */ #define USB_DEVICE_ID_LD_XRAYCT 0x1110 /* USB Product ID of X-Ray Apparatus CT 554821*/ #define USB_DEVICE_ID_LD_VIDEOCOM 0x1200 /* USB Product ID of VideoCom */ #define USB_DEVICE_ID_LD_MOTOR 0x1210 /* USB Product ID of Motor (reserved) */ #define USB_DEVICE_ID_LD_COM3LAB 0x2000 /* USB Product ID of COM3LAB */ #define USB_DEVICE_ID_LD_TELEPORT 0x2010 /* USB Product ID of Terminal Adapter */ #define USB_DEVICE_ID_LD_NETWORKANALYSER 0x2020 /* USB Product ID of Network Analyser */ #define USB_DEVICE_ID_LD_POWERCONTROL 0x2030 /* USB Product ID of Converter Control Unit */ #define USB_DEVICE_ID_LD_MACHINETEST 0x2040 /* USB Product ID of Machine Test System */ #define USB_DEVICE_ID_LD_MOSTANALYSER 0x2050 /* USB Product ID of MOST Protocol Analyser */ #define USB_DEVICE_ID_LD_MOSTANALYSER2 0x2051 /* USB 
Product ID of MOST Protocol Analyser 2 */ #define USB_DEVICE_ID_LD_ABSESP 0x2060 /* USB Product ID of ABS ESP */ #define USB_DEVICE_ID_LD_AUTODATABUS 0x2070 /* USB Product ID of Automotive Data Buses */ #define USB_DEVICE_ID_LD_MCT 0x2080 /* USB Product ID of Microcontroller technique */ #define USB_DEVICE_ID_LD_HYBRID 0x2090 /* USB Product ID of Automotive Hybrid */ #define USB_DEVICE_ID_LD_HEATCONTROL 0x20A0 /* USB Product ID of Heat control */ #ifdef CONFIG_USB_DYNAMIC_MINORS #define USB_LD_MINOR_BASE 0 #else #define USB_LD_MINOR_BASE 176 #endif /* table of devices that work with this driver */ static const struct usb_device_id ld_usb_table[] = { { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_CASSY) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_CASSY2) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_POCKETCASSY) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_POCKETCASSY2) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_MOBILECASSY) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_MOBILECASSY2) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_MICROCASSYVOLTAGE) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_MICROCASSYCURRENT) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_MICROCASSYTIME) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_MICROCASSYTEMPERATURE) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_MICROCASSYPH) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_POWERANALYSERCASSY) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_CONVERTERCONTROLLERCASSY) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_MACHINETESTCASSY) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_JWM) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_DMMP) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_UMIP) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_UMIC) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_UMIB) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_XRAY) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_XRAY2) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_VIDEOCOM) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_MOTOR) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_COM3LAB) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_TELEPORT) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_NETWORKANALYSER) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_POWERCONTROL) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_MACHINETEST) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_MOSTANALYSER) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_MOSTANALYSER2) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_ABSESP) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_AUTODATABUS) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_MCT) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_HYBRID) }, { USB_DEVICE(USB_VENDOR_ID_LD, USB_DEVICE_ID_LD_HEATCONTROL) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, ld_usb_table); MODULE_AUTHOR("Michael Hund <mhund@ld-didactic.de>"); MODULE_DESCRIPTION("LD USB Driver"); MODULE_LICENSE("GPL"); /* All interrupt in transfers are collected in a ring buffer to * avoid racing conditions and get better performance of the driver. */ static int ring_buffer_size = 128; module_param(ring_buffer_size, int, 0000); MODULE_PARM_DESC(ring_buffer_size, "Read ring buffer size in reports"); /* The write_buffer can contain more than one interrupt out transfer. 
*/ static int write_buffer_size = 10; module_param(write_buffer_size, int, 0000); MODULE_PARM_DESC(write_buffer_size, "Write buffer size in reports"); /* As of kernel version 2.6.4 ehci-hcd uses an * "only one interrupt transfer per frame" shortcut * to simplify the scheduling of periodic transfers. * This conflicts with our standard 1ms intervals for in and out URBs. * We use default intervals of 2ms for in and 2ms for out transfers, * which should be fast enough. * Increase the interval to allow more devices that do interrupt transfers, * or set to 1 to use the standard interval from the endpoint descriptors. */ static int min_interrupt_in_interval = 2; module_param(min_interrupt_in_interval, int, 0000); MODULE_PARM_DESC(min_interrupt_in_interval, "Minimum interrupt in interval in ms"); static int min_interrupt_out_interval = 2; module_param(min_interrupt_out_interval, int, 0000); MODULE_PARM_DESC(min_interrupt_out_interval, "Minimum interrupt out interval in ms"); /* Structure to hold all of our device specific stuff */ struct ld_usb { struct mutex mutex; /* locks this structure */ struct usb_interface *intf; /* save off the usb interface pointer */ unsigned long disconnected:1; int open_count; /* number of times this port has been opened */ char *ring_buffer; unsigned int ring_head; unsigned int ring_tail; wait_queue_head_t read_wait; wait_queue_head_t write_wait; char *interrupt_in_buffer; struct usb_endpoint_descriptor *interrupt_in_endpoint; struct urb *interrupt_in_urb; int interrupt_in_interval; size_t interrupt_in_endpoint_size; int interrupt_in_running; int interrupt_in_done; int buffer_overflow; spinlock_t rbsl; char *interrupt_out_buffer; struct usb_endpoint_descriptor *interrupt_out_endpoint; struct urb *interrupt_out_urb; int interrupt_out_interval; size_t interrupt_out_endpoint_size; int interrupt_out_busy; }; static struct usb_driver ld_usb_driver; /* * ld_usb_abort_transfers * aborts transfers and frees associated data structures */ static void ld_usb_abort_transfers(struct ld_usb *dev) { /* shutdown transfer */ if (dev->interrupt_in_running) { dev->interrupt_in_running = 0; usb_kill_urb(dev->interrupt_in_urb); } if (dev->interrupt_out_busy) usb_kill_urb(dev->interrupt_out_urb); } /* * ld_usb_delete */ static void ld_usb_delete(struct ld_usb *dev) { /* free data structures */ usb_free_urb(dev->interrupt_in_urb); usb_free_urb(dev->interrupt_out_urb); kfree(dev->ring_buffer); kfree(dev->interrupt_in_buffer); kfree(dev->interrupt_out_buffer); kfree(dev); } /* * ld_usb_interrupt_in_callback */ static void ld_usb_interrupt_in_callback(struct urb *urb) { struct ld_usb *dev = urb->context; size_t *actual_buffer; unsigned int next_ring_head; int status = urb->status; unsigned long flags; int retval; if (status) { if (status == -ENOENT || status == -ECONNRESET || status == -ESHUTDOWN) { goto exit; } else { dev_dbg(&dev->intf->dev, "%s: nonzero status received: %d\n", __func__, status); spin_lock_irqsave(&dev->rbsl, flags); goto resubmit; /* maybe we can recover */ } } spin_lock_irqsave(&dev->rbsl, flags); if (urb->actual_length > 0) { next_ring_head = (dev->ring_head+1) % ring_buffer_size; if (next_ring_head != dev->ring_tail) { actual_buffer = (size_t *)(dev->ring_buffer + dev->ring_head * (sizeof(size_t)+dev->interrupt_in_endpoint_size)); /* actual_buffer gets urb->actual_length + interrupt_in_buffer */ *actual_buffer = urb->actual_length; memcpy(actual_buffer+1, dev->interrupt_in_buffer, urb->actual_length); dev->ring_head = next_ring_head; dev_dbg(&dev->intf->dev, "%s: 
received %d bytes\n", __func__, urb->actual_length); } else { dev_warn(&dev->intf->dev, "Ring buffer overflow, %d bytes dropped\n", urb->actual_length); dev->buffer_overflow = 1; } } resubmit: /* resubmit if we're still running */ if (dev->interrupt_in_running && !dev->buffer_overflow) { retval = usb_submit_urb(dev->interrupt_in_urb, GFP_ATOMIC); if (retval) { dev_err(&dev->intf->dev, "usb_submit_urb failed (%d)\n", retval); dev->buffer_overflow = 1; } } spin_unlock_irqrestore(&dev->rbsl, flags); exit: dev->interrupt_in_done = 1; wake_up_interruptible(&dev->read_wait); } /* * ld_usb_interrupt_out_callback */ static void ld_usb_interrupt_out_callback(struct urb *urb) { struct ld_usb *dev = urb->context; int status = urb->status; /* sync/async unlink faults aren't errors */ if (status && !(status == -ENOENT || status == -ECONNRESET || status == -ESHUTDOWN)) dev_dbg(&dev->intf->dev, "%s - nonzero write interrupt status received: %d\n", __func__, status); dev->interrupt_out_busy = 0; wake_up_interruptible(&dev->write_wait); } /* * ld_usb_open */ static int ld_usb_open(struct inode *inode, struct file *file) { struct ld_usb *dev; int subminor; int retval; struct usb_interface *interface; stream_open(inode, file); subminor = iminor(inode); interface = usb_find_interface(&ld_usb_driver, subminor); if (!interface) { printk(KERN_ERR "%s - error, can't find device for minor %d\n", __func__, subminor); return -ENODEV; } dev = usb_get_intfdata(interface); if (!dev) return -ENODEV; /* lock this device */ if (mutex_lock_interruptible(&dev->mutex)) return -ERESTARTSYS; /* allow opening only once */ if (dev->open_count) { retval = -EBUSY; goto unlock_exit; } dev->open_count = 1; /* initialize in direction */ dev->ring_head = 0; dev->ring_tail = 0; dev->buffer_overflow = 0; usb_fill_int_urb(dev->interrupt_in_urb, interface_to_usbdev(interface), usb_rcvintpipe(interface_to_usbdev(interface), dev->interrupt_in_endpoint->bEndpointAddress), dev->interrupt_in_buffer, dev->interrupt_in_endpoint_size, ld_usb_interrupt_in_callback, dev, dev->interrupt_in_interval); dev->interrupt_in_running = 1; dev->interrupt_in_done = 0; retval = usb_submit_urb(dev->interrupt_in_urb, GFP_KERNEL); if (retval) { dev_err(&interface->dev, "Couldn't submit interrupt_in_urb %d\n", retval); dev->interrupt_in_running = 0; dev->open_count = 0; goto unlock_exit; } /* save device in the file's private structure */ file->private_data = dev; unlock_exit: mutex_unlock(&dev->mutex); return retval; } /* * ld_usb_release */ static int ld_usb_release(struct inode *inode, struct file *file) { struct ld_usb *dev; int retval = 0; dev = file->private_data; if (dev == NULL) { retval = -ENODEV; goto exit; } mutex_lock(&dev->mutex); if (dev->open_count != 1) { retval = -ENODEV; goto unlock_exit; } if (dev->disconnected) { /* the device was unplugged before the file was released */ mutex_unlock(&dev->mutex); /* unlock here as ld_usb_delete frees dev */ ld_usb_delete(dev); goto exit; } /* wait until write transfer is finished */ if (dev->interrupt_out_busy) wait_event_interruptible_timeout(dev->write_wait, !dev->interrupt_out_busy, 2 * HZ); ld_usb_abort_transfers(dev); dev->open_count = 0; unlock_exit: mutex_unlock(&dev->mutex); exit: return retval; } /* * ld_usb_poll */ static __poll_t ld_usb_poll(struct file *file, poll_table *wait) { struct ld_usb *dev; __poll_t mask = 0; dev = file->private_data; if (dev->disconnected) return EPOLLERR | EPOLLHUP; poll_wait(file, &dev->read_wait, wait); poll_wait(file, &dev->write_wait, wait); if (dev->ring_head != 
dev->ring_tail) mask |= EPOLLIN | EPOLLRDNORM; if (!dev->interrupt_out_busy) mask |= EPOLLOUT | EPOLLWRNORM; return mask; } /* * ld_usb_read */ static ssize_t ld_usb_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct ld_usb *dev; size_t *actual_buffer; size_t bytes_to_read; int retval = 0; int rv; dev = file->private_data; /* verify that we actually have some data to read */ if (count == 0) goto exit; /* lock this object */ if (mutex_lock_interruptible(&dev->mutex)) { retval = -ERESTARTSYS; goto exit; } /* verify that the device wasn't unplugged */ if (dev->disconnected) { retval = -ENODEV; printk(KERN_ERR "ldusb: No device or device unplugged %d\n", retval); goto unlock_exit; } /* wait for data */ spin_lock_irq(&dev->rbsl); while (dev->ring_head == dev->ring_tail) { dev->interrupt_in_done = 0; spin_unlock_irq(&dev->rbsl); if (file->f_flags & O_NONBLOCK) { retval = -EAGAIN; goto unlock_exit; } retval = wait_event_interruptible(dev->read_wait, dev->interrupt_in_done); if (retval < 0) goto unlock_exit; spin_lock_irq(&dev->rbsl); } spin_unlock_irq(&dev->rbsl); /* actual_buffer contains actual_length + interrupt_in_buffer */ actual_buffer = (size_t *)(dev->ring_buffer + dev->ring_tail * (sizeof(size_t)+dev->interrupt_in_endpoint_size)); if (*actual_buffer > dev->interrupt_in_endpoint_size) { retval = -EIO; goto unlock_exit; } bytes_to_read = min(count, *actual_buffer); if (bytes_to_read < *actual_buffer) dev_warn(&dev->intf->dev, "Read buffer overflow, %zu bytes dropped\n", *actual_buffer-bytes_to_read); /* copy one interrupt_in_buffer from ring_buffer into userspace */ if (copy_to_user(buffer, actual_buffer+1, bytes_to_read)) { retval = -EFAULT; goto unlock_exit; } retval = bytes_to_read; spin_lock_irq(&dev->rbsl); dev->ring_tail = (dev->ring_tail + 1) % ring_buffer_size; if (dev->buffer_overflow) { dev->buffer_overflow = 0; spin_unlock_irq(&dev->rbsl); rv = usb_submit_urb(dev->interrupt_in_urb, GFP_KERNEL); if (rv < 0) dev->buffer_overflow = 1; } else { spin_unlock_irq(&dev->rbsl); } unlock_exit: /* unlock the device */ mutex_unlock(&dev->mutex); exit: return retval; } /* * ld_usb_write */ static ssize_t ld_usb_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct ld_usb *dev; size_t bytes_to_write; int retval = 0; dev = file->private_data; /* verify that we actually have some data to write */ if (count == 0) goto exit; /* lock this object */ if (mutex_lock_interruptible(&dev->mutex)) { retval = -ERESTARTSYS; goto exit; } /* verify that the device wasn't unplugged */ if (dev->disconnected) { retval = -ENODEV; printk(KERN_ERR "ldusb: No device or device unplugged %d\n", retval); goto unlock_exit; } /* wait until previous transfer is finished */ if (dev->interrupt_out_busy) { if (file->f_flags & O_NONBLOCK) { retval = -EAGAIN; goto unlock_exit; } retval = wait_event_interruptible(dev->write_wait, !dev->interrupt_out_busy); if (retval < 0) { goto unlock_exit; } } /* write the data into interrupt_out_buffer from userspace */ bytes_to_write = min(count, write_buffer_size*dev->interrupt_out_endpoint_size); if (bytes_to_write < count) dev_warn(&dev->intf->dev, "Write buffer overflow, %zu bytes dropped\n", count - bytes_to_write); dev_dbg(&dev->intf->dev, "%s: count = %zu, bytes_to_write = %zu\n", __func__, count, bytes_to_write); if (copy_from_user(dev->interrupt_out_buffer, buffer, bytes_to_write)) { retval = -EFAULT; goto unlock_exit; } if (dev->interrupt_out_endpoint == NULL) { /* try HID_REQ_SET_REPORT=9 on control_endpoint 
instead of interrupt_out_endpoint */ retval = usb_control_msg(interface_to_usbdev(dev->intf), usb_sndctrlpipe(interface_to_usbdev(dev->intf), 0), 9, USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_OUT, 1 << 8, 0, dev->interrupt_out_buffer, bytes_to_write, USB_CTRL_SET_TIMEOUT); if (retval < 0) dev_err(&dev->intf->dev, "Couldn't submit HID_REQ_SET_REPORT %d\n", retval); goto unlock_exit; } /* send off the urb */ usb_fill_int_urb(dev->interrupt_out_urb, interface_to_usbdev(dev->intf), usb_sndintpipe(interface_to_usbdev(dev->intf), dev->interrupt_out_endpoint->bEndpointAddress), dev->interrupt_out_buffer, bytes_to_write, ld_usb_interrupt_out_callback, dev, dev->interrupt_out_interval); dev->interrupt_out_busy = 1; wmb(); retval = usb_submit_urb(dev->interrupt_out_urb, GFP_KERNEL); if (retval) { dev->interrupt_out_busy = 0; dev_err(&dev->intf->dev, "Couldn't submit interrupt_out_urb %d\n", retval); goto unlock_exit; } retval = bytes_to_write; unlock_exit: /* unlock the device */ mutex_unlock(&dev->mutex); exit: return retval; } /* file operations needed when we register this driver */ static const struct file_operations ld_usb_fops = { .owner = THIS_MODULE, .read = ld_usb_read, .write = ld_usb_write, .open = ld_usb_open, .release = ld_usb_release, .poll = ld_usb_poll, .llseek = no_llseek, }; /* * usb class driver info in order to get a minor number from the usb core, * and to have the device registered with the driver core */ static struct usb_class_driver ld_usb_class = { .name = "ldusb%d", .fops = &ld_usb_fops, .minor_base = USB_LD_MINOR_BASE, }; /* * ld_usb_probe * * Called by the usb core when a new device is connected that it thinks * this driver might be interested in. */ static int ld_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); struct ld_usb *dev = NULL; struct usb_host_interface *iface_desc; char *buffer; int retval = -ENOMEM; int res; /* allocate memory for our device state and initialize it */ dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) goto exit; mutex_init(&dev->mutex); spin_lock_init(&dev->rbsl); dev->intf = intf; init_waitqueue_head(&dev->read_wait); init_waitqueue_head(&dev->write_wait); /* workaround for early firmware versions on fast computers */ if ((le16_to_cpu(udev->descriptor.idVendor) == USB_VENDOR_ID_LD) && ((le16_to_cpu(udev->descriptor.idProduct) == USB_DEVICE_ID_LD_CASSY) || (le16_to_cpu(udev->descriptor.idProduct) == USB_DEVICE_ID_LD_COM3LAB)) && (le16_to_cpu(udev->descriptor.bcdDevice) <= 0x103)) { buffer = kmalloc(256, GFP_KERNEL); if (!buffer) goto error; /* usb_string makes SETUP+STALL to leave always ControlReadLoop */ usb_string(udev, 255, buffer, 256); kfree(buffer); } iface_desc = intf->cur_altsetting; res = usb_find_last_int_in_endpoint(iface_desc, &dev->interrupt_in_endpoint); if (res) { dev_err(&intf->dev, "Interrupt in endpoint not found\n"); retval = res; goto error; } res = usb_find_last_int_out_endpoint(iface_desc, &dev->interrupt_out_endpoint); if (res) dev_warn(&intf->dev, "Interrupt out endpoint not found (using control endpoint instead)\n"); dev->interrupt_in_endpoint_size = usb_endpoint_maxp(dev->interrupt_in_endpoint); dev->ring_buffer = kcalloc(ring_buffer_size, sizeof(size_t) + dev->interrupt_in_endpoint_size, GFP_KERNEL); if (!dev->ring_buffer) goto error; dev->interrupt_in_buffer = kmalloc(dev->interrupt_in_endpoint_size, GFP_KERNEL); if (!dev->interrupt_in_buffer) goto error; dev->interrupt_in_urb = usb_alloc_urb(0, GFP_KERNEL); if 
(!dev->interrupt_in_urb) goto error; dev->interrupt_out_endpoint_size = dev->interrupt_out_endpoint ? usb_endpoint_maxp(dev->interrupt_out_endpoint) : udev->descriptor.bMaxPacketSize0; dev->interrupt_out_buffer = kmalloc_array(write_buffer_size, dev->interrupt_out_endpoint_size, GFP_KERNEL); if (!dev->interrupt_out_buffer) goto error; dev->interrupt_out_urb = usb_alloc_urb(0, GFP_KERNEL); if (!dev->interrupt_out_urb) goto error; dev->interrupt_in_interval = max_t(int, min_interrupt_in_interval, dev->interrupt_in_endpoint->bInterval); if (dev->interrupt_out_endpoint) dev->interrupt_out_interval = max_t(int, min_interrupt_out_interval, dev->interrupt_out_endpoint->bInterval); /* we can register the device now, as it is ready */ usb_set_intfdata(intf, dev); retval = usb_register_dev(intf, &ld_usb_class); if (retval) { /* something prevented us from registering this driver */ dev_err(&intf->dev, "Not able to get a minor for this device.\n"); usb_set_intfdata(intf, NULL); goto error; } /* let the user know what node this device is now attached to */ dev_info(&intf->dev, "LD USB Device #%d now attached to major %d minor %d\n", (intf->minor - USB_LD_MINOR_BASE), USB_MAJOR, intf->minor); exit: return retval; error: ld_usb_delete(dev); return retval; } /* * ld_usb_disconnect * * Called by the usb core when the device is removed from the system. */ static void ld_usb_disconnect(struct usb_interface *intf) { struct ld_usb *dev; int minor; dev = usb_get_intfdata(intf); usb_set_intfdata(intf, NULL); minor = intf->minor; /* give back our minor */ usb_deregister_dev(intf, &ld_usb_class); usb_poison_urb(dev->interrupt_in_urb); usb_poison_urb(dev->interrupt_out_urb); mutex_lock(&dev->mutex); /* if the device is not opened, then we clean up right now */ if (!dev->open_count) { mutex_unlock(&dev->mutex); ld_usb_delete(dev); } else { dev->disconnected = 1; /* wake up pollers */ wake_up_interruptible_all(&dev->read_wait); wake_up_interruptible_all(&dev->write_wait); mutex_unlock(&dev->mutex); } dev_info(&intf->dev, "LD USB Device #%d now disconnected\n", (minor - USB_LD_MINOR_BASE)); } /* usb specific object needed to register this driver with the usb subsystem */ static struct usb_driver ld_usb_driver = { .name = "ldusb", .probe = ld_usb_probe, .disconnect = ld_usb_disconnect, .id_table = ld_usb_table, }; module_usb_driver(ld_usb_driver);
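The file operations above give the device a simple report-oriented character interface: each read() returns at most one interrupt-in report taken from the ring buffer, each write() queues one interrupt-out transfer (or a HID SET_REPORT control transfer when the device has no interrupt-out endpoint), poll() reports readability and writability, and only a single opener is allowed at a time. The following minimal userspace sketch illustrates that usage; the /dev/ldusb0 node name and the 64-byte report buffer are assumptions, since the actual node name depends on the assigned minor (and udev) and the report size on the endpoint's wMaxPacketSize.

/*
 * Minimal userspace sketch (not part of the driver): open the node,
 * poll for readability and fetch one interrupt-in report per read().
 * /dev/ldusb0 is an assumed device node name.
 */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned char report[64];	/* should be >= the endpoint's wMaxPacketSize */
	struct pollfd pfd;
	ssize_t n;
	int fd;

	fd = open("/dev/ldusb0", O_RDWR);	/* the driver allows a single opener */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	pfd.fd = fd;
	pfd.events = POLLIN;

	/* Each successful read() returns exactly one interrupt-in report. */
	while (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
		n = read(fd, report, sizeof(report));
		if (n <= 0)
			break;
		printf("got %zd byte report\n", n);
	}

	close(fd);
	return 0;
}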
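The gpiolib core that follows implements the provider-side registration path (gpiochip_add_data_with_key() and the gpio_device machinery built around it). For orientation, here is a minimal, hypothetical sketch of the driver side this core serves: a platform driver for an imagined memory-mapped controller fills in a struct gpio_chip, leaves .base at -1 to request dynamic number allocation, and registers it with devm_gpiochip_add_data(). All names, the register layout and ngpio = 32 are illustrative assumptions, not code from this file.

/*
 * Hypothetical provider-driver sketch (not part of gpiolib.c): register a
 * gpio_chip so that the core below can create the gpio_device, allocate a
 * dynamic base and expose the lines.  The platform_driver boilerplate is
 * omitted for brevity.
 */
#include <linux/bits.h>
#include <linux/err.h>
#include <linux/gpio/driver.h>
#include <linux/io.h>
#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/slab.h>

struct foo_gpio {
	void __iomem *base;	/* assumed single data register at offset 0 */
	struct gpio_chip gc;
};

static int foo_gpio_get(struct gpio_chip *gc, unsigned int offset)
{
	struct foo_gpio *priv = gpiochip_get_data(gc);

	return !!(readl(priv->base) & BIT(offset));
}

static void foo_gpio_set(struct gpio_chip *gc, unsigned int offset, int value)
{
	struct foo_gpio *priv = gpiochip_get_data(gc);
	u32 val = readl(priv->base);

	if (value)
		val |= BIT(offset);
	else
		val &= ~BIT(offset);
	writel(val, priv->base);
}

static int foo_gpio_probe(struct platform_device *pdev)
{
	struct foo_gpio *priv;

	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	priv->base = devm_platform_ioremap_resource(pdev, 0);
	if (IS_ERR(priv->base))
		return PTR_ERR(priv->base);

	priv->gc.label = "foo-gpio";
	priv->gc.parent = &pdev->dev;
	priv->gc.owner = THIS_MODULE;
	priv->gc.base = -1;	/* ask the core to pick a dynamic base */
	priv->gc.ngpio = 32;
	priv->gc.get = foo_gpio_get;
	priv->gc.set = foo_gpio_set;

	/* priv is what gpiochip_get_data() returns in the callbacks above */
	return devm_gpiochip_add_data(&pdev->dev, &priv->gc, priv);
}

On unbind, the devres action registered by devm_gpiochip_add_data() tears the chip down again through gpiochip_remove(), whose implementation also appears in the code below.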
// SPDX-License-Identifier: GPL-2.0 #include <linux/acpi.h> #include <linux/bitmap.h> #include <linux/compat.h> #include <linux/debugfs.h> #include <linux/device.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/idr.h> #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/module.h> #include <linux/of.h> #include <linux/pinctrl/consumer.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/gpio.h> #include <linux/gpio/driver.h> #include <linux/gpio/machine.h> #include <uapi/linux/gpio.h> #include "gpiolib-acpi.h" #include "gpiolib-cdev.h" #include "gpiolib-of.h" #include "gpiolib-swnode.h" #include "gpiolib-sysfs.h" #include "gpiolib.h" #define CREATE_TRACE_POINTS #include <trace/events/gpio.h> /* Implementation infrastructure for GPIO interfaces. * * The GPIO programming interface allows for inlining speed-critical * get/set operations for common cases, so that access to SOC-integrated * GPIOs can sometimes cost only an instruction or two per bit. */ /* Device and char device-related information */ static DEFINE_IDA(gpio_ida); static dev_t gpio_devt; #define GPIO_DEV_MAX 256 /* 256 GPIO chip devices supported */ static int gpio_bus_match(struct device *dev, struct device_driver *drv) { struct fwnode_handle *fwnode = dev_fwnode(dev); /* * Only match if the fwnode doesn't already have a proper struct device * created for it. 
*/ if (fwnode && fwnode->dev != dev) return 0; return 1; } static struct bus_type gpio_bus_type = { .name = "gpio", .match = gpio_bus_match, }; /* * Number of GPIOs to use for the fast path in set array */ #define FASTPATH_NGPIO CONFIG_GPIOLIB_FASTPATH_LIMIT /* gpio_lock prevents conflicts during gpio_desc[] table updates. * While any GPIO is requested, its gpio_chip is not removable; * each GPIO's "requested" flag serves as a lock and refcount. */ DEFINE_SPINLOCK(gpio_lock); static DEFINE_MUTEX(gpio_lookup_lock); static LIST_HEAD(gpio_lookup_list); LIST_HEAD(gpio_devices); static DEFINE_MUTEX(gpio_machine_hogs_mutex); static LIST_HEAD(gpio_machine_hogs); static void gpiochip_free_hogs(struct gpio_chip *gc); static int gpiochip_add_irqchip(struct gpio_chip *gc, struct lock_class_key *lock_key, struct lock_class_key *request_key); static void gpiochip_irqchip_remove(struct gpio_chip *gc); static int gpiochip_irqchip_init_hw(struct gpio_chip *gc); static int gpiochip_irqchip_init_valid_mask(struct gpio_chip *gc); static void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gc); static bool gpiolib_initialized; static inline void desc_set_label(struct gpio_desc *d, const char *label) { d->label = label; } /** * gpio_to_desc - Convert a GPIO number to its descriptor * @gpio: global GPIO number * * Returns: * The GPIO descriptor associated with the given GPIO, or %NULL if no GPIO * with the given number exists in the system. */ struct gpio_desc *gpio_to_desc(unsigned gpio) { struct gpio_device *gdev; unsigned long flags; spin_lock_irqsave(&gpio_lock, flags); list_for_each_entry(gdev, &gpio_devices, list) { if (gdev->base <= gpio && gdev->base + gdev->ngpio > gpio) { spin_unlock_irqrestore(&gpio_lock, flags); return &gdev->descs[gpio - gdev->base]; } } spin_unlock_irqrestore(&gpio_lock, flags); if (!gpio_is_valid(gpio)) pr_warn("invalid GPIO %d\n", gpio); return NULL; } EXPORT_SYMBOL_GPL(gpio_to_desc); /* This function is deprecated and will be removed soon, don't use. */ struct gpio_desc *gpiochip_get_desc(struct gpio_chip *gc, unsigned int hwnum) { return gpio_device_get_desc(gc->gpiodev, hwnum); } EXPORT_SYMBOL_GPL(gpiochip_get_desc); /** * gpio_device_get_desc() - get the GPIO descriptor corresponding to the given * hardware number for this GPIO device * @gdev: GPIO device to get the descriptor from * @hwnum: hardware number of the GPIO for this chip * * Returns: * A pointer to the GPIO descriptor or %EINVAL if no GPIO exists in the given * chip for the specified hardware number or %ENODEV if the underlying chip * already vanished. * * The reference count of struct gpio_device is *NOT* increased like when the * GPIO is being requested for exclusive usage. It's up to the caller to make * sure the GPIO device will stay alive together with the descriptor returned * by this function. */ struct gpio_desc * gpio_device_get_desc(struct gpio_device *gdev, unsigned int hwnum) { struct gpio_chip *gc; /* * FIXME: This will be locked once we protect gdev->chip everywhere * with SRCU. */ gc = gdev->chip; if (!gc) return ERR_PTR(-ENODEV); if (hwnum >= gdev->ngpio) return ERR_PTR(-EINVAL); return &gdev->descs[hwnum]; } EXPORT_SYMBOL_GPL(gpio_device_get_desc); /** * desc_to_gpio - convert a GPIO descriptor to the integer namespace * @desc: GPIO descriptor * * This should disappear in the future but is needed since we still * use GPIO numbers for error messages and sysfs nodes. * * Returns: * The global GPIO number for the GPIO specified by its descriptor. 
*/ int desc_to_gpio(const struct gpio_desc *desc) { return desc->gdev->base + (desc - &desc->gdev->descs[0]); } EXPORT_SYMBOL_GPL(desc_to_gpio); /** * gpiod_to_chip - Return the GPIO chip to which a GPIO descriptor belongs * @desc: descriptor to return the chip of */ struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc) { if (!desc || !desc->gdev) return NULL; return desc->gdev->chip; } EXPORT_SYMBOL_GPL(gpiod_to_chip); /** * gpiod_to_gpio_device() - Return the GPIO device to which this descriptor * belongs. * @desc: Descriptor for which to return the GPIO device. * * This *DOES NOT* increase the reference count of the GPIO device as it's * expected that the descriptor is requested and the users already holds a * reference to the device. * * Returns: * Address of the GPIO device owning this descriptor. */ struct gpio_device *gpiod_to_gpio_device(struct gpio_desc *desc) { if (!desc) return NULL; return desc->gdev; } EXPORT_SYMBOL_GPL(gpiod_to_gpio_device); /** * gpio_device_get_base() - Get the base GPIO number allocated by this device * @gdev: GPIO device * * Returns: * First GPIO number in the global GPIO numberspace for this device. */ int gpio_device_get_base(struct gpio_device *gdev) { return gdev->base; } EXPORT_SYMBOL_GPL(gpio_device_get_base); /** * gpio_device_get_label() - Get the label of this GPIO device * @gdev: GPIO device * * Returns: * Pointer to the string containing the GPIO device label. The string's * lifetime is tied to that of the underlying GPIO device. */ const char *gpio_device_get_label(struct gpio_device *gdev) { return gdev->label; } EXPORT_SYMBOL(gpio_device_get_label); /** * gpio_device_get_chip() - Get the gpio_chip implementation of this GPIO device * @gdev: GPIO device * * Returns: * Address of the GPIO chip backing this device. * * Until we can get rid of all non-driver users of struct gpio_chip, we must * provide a way of retrieving the pointer to it from struct gpio_device. This * is *NOT* safe as the GPIO API is considered to be hot-unpluggable and the * chip can dissapear at any moment (unlike reference-counted struct * gpio_device). * * Use at your own risk. */ struct gpio_chip *gpio_device_get_chip(struct gpio_device *gdev) { return gdev->chip; } EXPORT_SYMBOL_GPL(gpio_device_get_chip); /* dynamic allocation of GPIOs, e.g. on a hotplugged device */ static int gpiochip_find_base_unlocked(int ngpio) { struct gpio_device *gdev; int base = GPIO_DYNAMIC_BASE; list_for_each_entry(gdev, &gpio_devices, list) { /* found a free space? */ if (gdev->base >= base + ngpio) break; /* nope, check the space right after the chip */ base = gdev->base + gdev->ngpio; if (base < GPIO_DYNAMIC_BASE) base = GPIO_DYNAMIC_BASE; } if (gpio_is_valid(base)) { pr_debug("%s: found new base at %d\n", __func__, base); return base; } else { pr_err("%s: cannot find free range\n", __func__); return -ENOSPC; } } /** * gpiod_get_direction - return the current direction of a GPIO * @desc: GPIO to get the direction of * * Returns 0 for output, 1 for input, or an error code in case of error. * * This function may sleep if gpiod_cansleep() is true. */ int gpiod_get_direction(struct gpio_desc *desc) { struct gpio_chip *gc; unsigned int offset; int ret; gc = gpiod_to_chip(desc); offset = gpio_chip_hwgpio(desc); /* * Open drain emulation using input mode may incorrectly report * input here, fix that up. 
*/ if (test_bit(FLAG_OPEN_DRAIN, &desc->flags) && test_bit(FLAG_IS_OUT, &desc->flags)) return 0; if (!gc->get_direction) return -ENOTSUPP; ret = gc->get_direction(gc, offset); if (ret < 0) return ret; /* GPIOF_DIR_IN or other positive, otherwise GPIOF_DIR_OUT */ if (ret > 0) ret = 1; assign_bit(FLAG_IS_OUT, &desc->flags, !ret); return ret; } EXPORT_SYMBOL_GPL(gpiod_get_direction); /* * Add a new chip to the global chips list, keeping the list of chips sorted * by range(means [base, base + ngpio - 1]) order. * * Return -EBUSY if the new chip overlaps with some other chip's integer * space. */ static int gpiodev_add_to_list_unlocked(struct gpio_device *gdev) { struct gpio_device *prev, *next; if (list_empty(&gpio_devices)) { /* initial entry in list */ list_add_tail(&gdev->list, &gpio_devices); return 0; } next = list_first_entry(&gpio_devices, struct gpio_device, list); if (gdev->base + gdev->ngpio <= next->base) { /* add before first entry */ list_add(&gdev->list, &gpio_devices); return 0; } prev = list_last_entry(&gpio_devices, struct gpio_device, list); if (prev->base + prev->ngpio <= gdev->base) { /* add behind last entry */ list_add_tail(&gdev->list, &gpio_devices); return 0; } list_for_each_entry_safe(prev, next, &gpio_devices, list) { /* at the end of the list */ if (&next->list == &gpio_devices) break; /* add between prev and next */ if (prev->base + prev->ngpio <= gdev->base && gdev->base + gdev->ngpio <= next->base) { list_add(&gdev->list, &prev->list); return 0; } } return -EBUSY; } /* * Convert a GPIO name to its descriptor * Note that there is no guarantee that GPIO names are globally unique! * Hence this function will return, if it exists, a reference to the first GPIO * line found that matches the given name. */ static struct gpio_desc *gpio_name_to_desc(const char * const name) { struct gpio_device *gdev; unsigned long flags; if (!name) return NULL; spin_lock_irqsave(&gpio_lock, flags); list_for_each_entry(gdev, &gpio_devices, list) { struct gpio_desc *desc; for_each_gpio_desc(gdev->chip, desc) { if (desc->name && !strcmp(desc->name, name)) { spin_unlock_irqrestore(&gpio_lock, flags); return desc; } } } spin_unlock_irqrestore(&gpio_lock, flags); return NULL; } /* * Take the names from gc->names and assign them to their GPIO descriptors. * Warn if a name is already used for a GPIO line on a different GPIO chip. * * Note that: * 1. Non-unique names are still accepted, * 2. Name collisions within the same GPIO chip are not reported. */ static int gpiochip_set_desc_names(struct gpio_chip *gc) { struct gpio_device *gdev = gc->gpiodev; int i; /* First check all names if they are unique */ for (i = 0; i != gc->ngpio; ++i) { struct gpio_desc *gpio; gpio = gpio_name_to_desc(gc->names[i]); if (gpio) dev_warn(&gdev->dev, "Detected name collision for GPIO name '%s'\n", gc->names[i]); } /* Then add all names to the GPIO descriptors */ for (i = 0; i != gc->ngpio; ++i) gdev->descs[i].name = gc->names[i]; return 0; } /* * gpiochip_set_names - Set GPIO line names using device properties * @chip: GPIO chip whose lines should be named, if possible * * Looks for device property "gpio-line-names" and if it exists assigns * GPIO line names for the chip. The memory allocated for the assigned * names belong to the underlying firmware node and should not be released * by the caller. 
*/ static int gpiochip_set_names(struct gpio_chip *chip) { struct gpio_device *gdev = chip->gpiodev; struct device *dev = &gdev->dev; const char **names; int ret, i; int count; count = device_property_string_array_count(dev, "gpio-line-names"); if (count < 0) return 0; /* * When offset is set in the driver side we assume the driver internally * is using more than one gpiochip per the same device. We have to stop * setting friendly names if the specified ones with 'gpio-line-names' * are less than the offset in the device itself. This means all the * lines are not present for every single pin within all the internal * gpiochips. */ if (count <= chip->offset) { dev_warn(dev, "gpio-line-names too short (length %d), cannot map names for the gpiochip at offset %u\n", count, chip->offset); return 0; } names = kcalloc(count, sizeof(*names), GFP_KERNEL); if (!names) return -ENOMEM; ret = device_property_read_string_array(dev, "gpio-line-names", names, count); if (ret < 0) { dev_warn(dev, "failed to read GPIO line names\n"); kfree(names); return ret; } /* * When more that one gpiochip per device is used, 'count' can * contain at most number gpiochips x chip->ngpio. We have to * correctly distribute all defined lines taking into account * chip->offset as starting point from where we will assign * the names to pins from the 'names' array. Since property * 'gpio-line-names' cannot contains gaps, we have to be sure * we only assign those pins that really exists since chip->ngpio * can be different of the chip->offset. */ count = (count > chip->offset) ? count - chip->offset : count; if (count > chip->ngpio) count = chip->ngpio; for (i = 0; i < count; i++) { /* * Allow overriding "fixed" names provided by the GPIO * provider. The "fixed" names are more often than not * generic and less informative than the names given in * device properties. */ if (names[chip->offset + i] && names[chip->offset + i][0]) gdev->descs[i].name = names[chip->offset + i]; } kfree(names); return 0; } static unsigned long *gpiochip_allocate_mask(struct gpio_chip *gc) { unsigned long *p; p = bitmap_alloc(gc->ngpio, GFP_KERNEL); if (!p) return NULL; /* Assume by default all GPIOs are valid */ bitmap_fill(p, gc->ngpio); return p; } static void gpiochip_free_mask(unsigned long **p) { bitmap_free(*p); *p = NULL; } static unsigned int gpiochip_count_reserved_ranges(struct gpio_chip *gc) { struct device *dev = &gc->gpiodev->dev; int size; /* Format is "start, count, ..." 
*/ size = device_property_count_u32(dev, "gpio-reserved-ranges"); if (size > 0 && size % 2 == 0) return size; return 0; } static int gpiochip_apply_reserved_ranges(struct gpio_chip *gc) { struct device *dev = &gc->gpiodev->dev; unsigned int size; u32 *ranges; int ret; size = gpiochip_count_reserved_ranges(gc); if (size == 0) return 0; ranges = kmalloc_array(size, sizeof(*ranges), GFP_KERNEL); if (!ranges) return -ENOMEM; ret = device_property_read_u32_array(dev, "gpio-reserved-ranges", ranges, size); if (ret) { kfree(ranges); return ret; } while (size) { u32 count = ranges[--size]; u32 start = ranges[--size]; if (start >= gc->ngpio || start + count > gc->ngpio) continue; bitmap_clear(gc->valid_mask, start, count); } kfree(ranges); return 0; } static int gpiochip_init_valid_mask(struct gpio_chip *gc) { int ret; if (!(gpiochip_count_reserved_ranges(gc) || gc->init_valid_mask)) return 0; gc->valid_mask = gpiochip_allocate_mask(gc); if (!gc->valid_mask) return -ENOMEM; ret = gpiochip_apply_reserved_ranges(gc); if (ret) return ret; if (gc->init_valid_mask) return gc->init_valid_mask(gc, gc->valid_mask, gc->ngpio); return 0; } static void gpiochip_free_valid_mask(struct gpio_chip *gc) { gpiochip_free_mask(&gc->valid_mask); } static int gpiochip_add_pin_ranges(struct gpio_chip *gc) { /* * Device Tree platforms are supposed to use "gpio-ranges" * property. This check ensures that the ->add_pin_ranges() * won't be called for them. */ if (device_property_present(&gc->gpiodev->dev, "gpio-ranges")) return 0; if (gc->add_pin_ranges) return gc->add_pin_ranges(gc); return 0; } bool gpiochip_line_is_valid(const struct gpio_chip *gc, unsigned int offset) { /* No mask means all valid */ if (likely(!gc->valid_mask)) return true; return test_bit(offset, gc->valid_mask); } EXPORT_SYMBOL_GPL(gpiochip_line_is_valid); static void gpiodev_release(struct device *dev) { struct gpio_device *gdev = to_gpio_device(dev); ida_free(&gpio_ida, gdev->id); kfree_const(gdev->label); kfree(gdev->descs); kfree(gdev); } #ifdef CONFIG_GPIO_CDEV #define gcdev_register(gdev, devt) gpiolib_cdev_register((gdev), (devt)) #define gcdev_unregister(gdev) gpiolib_cdev_unregister((gdev)) #else /* * gpiolib_cdev_register() indirectly calls device_add(), which is still * required even when cdev is not selected. */ #define gcdev_register(gdev, devt) device_add(&(gdev)->dev) #define gcdev_unregister(gdev) device_del(&(gdev)->dev) #endif static int gpiochip_setup_dev(struct gpio_device *gdev) { struct fwnode_handle *fwnode = dev_fwnode(&gdev->dev); int ret; /* * If fwnode doesn't belong to another device, it's safe to clear its * initialized flag. */ if (fwnode && !fwnode->dev) fwnode_dev_initialized(fwnode, false); ret = gcdev_register(gdev, gpio_devt); if (ret) return ret; /* From this point, the .release() function cleans up gpio_device */ gdev->dev.release = gpiodev_release; ret = gpiochip_sysfs_register(gdev); if (ret) goto err_remove_device; dev_dbg(&gdev->dev, "registered GPIOs %d to %d on %s\n", gdev->base, gdev->base + gdev->ngpio - 1, gdev->chip->label ? 
: "generic"); return 0; err_remove_device: gcdev_unregister(gdev); return ret; } static void gpiochip_machine_hog(struct gpio_chip *gc, struct gpiod_hog *hog) { struct gpio_desc *desc; int rv; desc = gpiochip_get_desc(gc, hog->chip_hwnum); if (IS_ERR(desc)) { chip_err(gc, "%s: unable to get GPIO desc: %ld\n", __func__, PTR_ERR(desc)); return; } if (test_bit(FLAG_IS_HOGGED, &desc->flags)) return; rv = gpiod_hog(desc, hog->line_name, hog->lflags, hog->dflags); if (rv) gpiod_err(desc, "%s: unable to hog GPIO line (%s:%u): %d\n", __func__, gc->label, hog->chip_hwnum, rv); } static void machine_gpiochip_add(struct gpio_chip *gc) { struct gpiod_hog *hog; mutex_lock(&gpio_machine_hogs_mutex); list_for_each_entry(hog, &gpio_machine_hogs, list) { if (!strcmp(gc->label, hog->chip_label)) gpiochip_machine_hog(gc, hog); } mutex_unlock(&gpio_machine_hogs_mutex); } static void gpiochip_setup_devs(void) { struct gpio_device *gdev; int ret; list_for_each_entry(gdev, &gpio_devices, list) { ret = gpiochip_setup_dev(gdev); if (ret) dev_err(&gdev->dev, "Failed to initialize gpio device (%d)\n", ret); } } static void gpiochip_set_data(struct gpio_chip *gc, void *data) { gc->gpiodev->data = data; } /** * gpiochip_get_data() - get per-subdriver data for the chip * @gc: GPIO chip * * Returns: * The per-subdriver data for the chip. */ void *gpiochip_get_data(struct gpio_chip *gc) { return gc->gpiodev->data; } EXPORT_SYMBOL_GPL(gpiochip_get_data); int gpiochip_get_ngpios(struct gpio_chip *gc, struct device *dev) { u32 ngpios = gc->ngpio; int ret; if (ngpios == 0) { ret = device_property_read_u32(dev, "ngpios", &ngpios); if (ret == -ENODATA) /* * -ENODATA means that there is no property found and * we want to issue the error message to the user. * Besides that, we want to return different error code * to state that supplied value is not valid. */ ngpios = 0; else if (ret) return ret; gc->ngpio = ngpios; } if (gc->ngpio == 0) { chip_err(gc, "tried to insert a GPIO chip with zero lines\n"); return -EINVAL; } if (gc->ngpio > FASTPATH_NGPIO) chip_warn(gc, "line cnt %u is greater than fast path cnt %u\n", gc->ngpio, FASTPATH_NGPIO); return 0; } EXPORT_SYMBOL_GPL(gpiochip_get_ngpios); int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, struct lock_class_key *lock_key, struct lock_class_key *request_key) { struct gpio_device *gdev; unsigned long flags; unsigned int i; int base = 0; int ret = 0; /* * First: allocate and populate the internal stat container, and * set up the struct device. */ gdev = kzalloc(sizeof(*gdev), GFP_KERNEL); if (!gdev) return -ENOMEM; gdev->dev.bus = &gpio_bus_type; gdev->dev.parent = gc->parent; gdev->chip = gc; gc->gpiodev = gdev; gpiochip_set_data(gc, data); /* * If the calling driver did not initialize firmware node, * do it here using the parent device, if any. 
*/ if (gc->fwnode) device_set_node(&gdev->dev, gc->fwnode); else if (gc->parent) device_set_node(&gdev->dev, dev_fwnode(gc->parent)); gdev->id = ida_alloc(&gpio_ida, GFP_KERNEL); if (gdev->id < 0) { ret = gdev->id; goto err_free_gdev; } ret = dev_set_name(&gdev->dev, GPIOCHIP_NAME "%d", gdev->id); if (ret) goto err_free_ida; device_initialize(&gdev->dev); if (gc->parent && gc->parent->driver) gdev->owner = gc->parent->driver->owner; else if (gc->owner) /* TODO: remove chip->owner */ gdev->owner = gc->owner; else gdev->owner = THIS_MODULE; ret = gpiochip_get_ngpios(gc, &gdev->dev); if (ret) goto err_free_dev_name; gdev->descs = kcalloc(gc->ngpio, sizeof(*gdev->descs), GFP_KERNEL); if (!gdev->descs) { ret = -ENOMEM; goto err_free_dev_name; } gdev->label = kstrdup_const(gc->label ?: "unknown", GFP_KERNEL); if (!gdev->label) { ret = -ENOMEM; goto err_free_descs; } gdev->ngpio = gc->ngpio; spin_lock_irqsave(&gpio_lock, flags); /* * TODO: this allocates a Linux GPIO number base in the global * GPIO numberspace for this chip. In the long run we want to * get *rid* of this numberspace and use only descriptors, but * it may be a pipe dream. It will not happen before we get rid * of the sysfs interface anyways. */ base = gc->base; if (base < 0) { base = gpiochip_find_base_unlocked(gc->ngpio); if (base < 0) { spin_unlock_irqrestore(&gpio_lock, flags); ret = base; base = 0; goto err_free_label; } /* * TODO: it should not be necessary to reflect the assigned * base outside of the GPIO subsystem. Go over drivers and * see if anyone makes use of this, else drop this and assign * a poison instead. */ gc->base = base; } else { dev_warn(&gdev->dev, "Static allocation of GPIO base is deprecated, use dynamic allocation.\n"); } gdev->base = base; ret = gpiodev_add_to_list_unlocked(gdev); if (ret) { spin_unlock_irqrestore(&gpio_lock, flags); chip_err(gc, "GPIO integer space overlap, cannot add chip\n"); goto err_free_label; } for (i = 0; i < gc->ngpio; i++) gdev->descs[i].gdev = gdev; spin_unlock_irqrestore(&gpio_lock, flags); BLOCKING_INIT_NOTIFIER_HEAD(&gdev->line_state_notifier); BLOCKING_INIT_NOTIFIER_HEAD(&gdev->device_notifier); init_rwsem(&gdev->sem); #ifdef CONFIG_PINCTRL INIT_LIST_HEAD(&gdev->pin_ranges); #endif if (gc->names) { ret = gpiochip_set_desc_names(gc); if (ret) goto err_remove_from_list; } ret = gpiochip_set_names(gc); if (ret) goto err_remove_from_list; ret = gpiochip_init_valid_mask(gc); if (ret) goto err_remove_from_list; ret = of_gpiochip_add(gc); if (ret) goto err_free_gpiochip_mask; for (i = 0; i < gc->ngpio; i++) { struct gpio_desc *desc = &gdev->descs[i]; if (gc->get_direction && gpiochip_line_is_valid(gc, i)) { assign_bit(FLAG_IS_OUT, &desc->flags, !gc->get_direction(gc, i)); } else { assign_bit(FLAG_IS_OUT, &desc->flags, !gc->direction_input); } } ret = gpiochip_add_pin_ranges(gc); if (ret) goto err_remove_of_chip; acpi_gpiochip_add(gc); machine_gpiochip_add(gc); ret = gpiochip_irqchip_init_valid_mask(gc); if (ret) goto err_remove_acpi_chip; ret = gpiochip_irqchip_init_hw(gc); if (ret) goto err_remove_acpi_chip; ret = gpiochip_add_irqchip(gc, lock_key, request_key); if (ret) goto err_remove_irqchip_mask; /* * By first adding the chardev, and then adding the device, * we get a device node entry in sysfs under * /sys/bus/gpio/devices/gpiochipN/dev that can be used for * coldplug of device nodes and other udev business. * We can do this only if gpiolib has been initialized. * Otherwise, defer until later. 
*/ if (gpiolib_initialized) { ret = gpiochip_setup_dev(gdev); if (ret) goto err_remove_irqchip; } return 0; err_remove_irqchip: gpiochip_irqchip_remove(gc); err_remove_irqchip_mask: gpiochip_irqchip_free_valid_mask(gc); err_remove_acpi_chip: acpi_gpiochip_remove(gc); err_remove_of_chip: gpiochip_free_hogs(gc); of_gpiochip_remove(gc); err_free_gpiochip_mask: gpiochip_remove_pin_ranges(gc); gpiochip_free_valid_mask(gc); err_remove_from_list: spin_lock_irqsave(&gpio_lock, flags); list_del(&gdev->list); spin_unlock_irqrestore(&gpio_lock, flags); if (gdev->dev.release) { /* release() has been registered by gpiochip_setup_dev() */ gpio_device_put(gdev); goto err_print_message; } err_free_label: kfree_const(gdev->label); err_free_descs: kfree(gdev->descs); err_free_dev_name: kfree(dev_name(&gdev->dev)); err_free_ida: ida_free(&gpio_ida, gdev->id); err_free_gdev: kfree(gdev); err_print_message: /* failures here can mean systems won't boot... */ if (ret != -EPROBE_DEFER) { pr_err("%s: GPIOs %d..%d (%s) failed to register, %d\n", __func__, base, base + (int)gc->ngpio - 1, gc->label ? : "generic", ret); } return ret; } EXPORT_SYMBOL_GPL(gpiochip_add_data_with_key); /** * gpiochip_remove() - unregister a gpio_chip * @gc: the chip to unregister * * A gpio_chip with any GPIOs still requested may not be removed. */ void gpiochip_remove(struct gpio_chip *gc) { struct gpio_device *gdev = gc->gpiodev; unsigned long flags; unsigned int i; down_write(&gdev->sem); /* FIXME: should the legacy sysfs handling be moved to gpio_device? */ gpiochip_sysfs_unregister(gdev); gpiochip_free_hogs(gc); /* Numb the device, cancelling all outstanding operations */ gdev->chip = NULL; gpiochip_irqchip_remove(gc); acpi_gpiochip_remove(gc); of_gpiochip_remove(gc); gpiochip_remove_pin_ranges(gc); gpiochip_free_valid_mask(gc); /* * We accept no more calls into the driver from this point, so * NULL the driver data pointer. */ gpiochip_set_data(gc, NULL); spin_lock_irqsave(&gpio_lock, flags); for (i = 0; i < gdev->ngpio; i++) { if (test_bit(FLAG_REQUESTED, &gdev->descs[i].flags)) break; } spin_unlock_irqrestore(&gpio_lock, flags); if (i != gdev->ngpio) dev_crit(&gdev->dev, "REMOVING GPIOCHIP WITH GPIOS STILL REQUESTED\n"); scoped_guard(spinlock_irqsave, &gpio_lock) list_del(&gdev->list); /* * The gpiochip side puts its use of the device to rest here: * if there are no userspace clients, the chardev and device will * be removed, else it will be dangling until the last user is * gone. */ gcdev_unregister(gdev); up_write(&gdev->sem); gpio_device_put(gdev); } EXPORT_SYMBOL_GPL(gpiochip_remove); /** * gpio_device_find() - find a specific GPIO device * @data: data to pass to match function * @match: Callback function to check gpio_chip * * Returns: * New reference to struct gpio_device. * * Similar to bus_find_device(). It returns a reference to a gpio_device as * determined by a user supplied @match callback. The callback should return * 0 if the device doesn't match and non-zero if it does. If the callback * returns non-zero, this function will return to the caller and not iterate * over any more gpio_devices. * * The callback takes the GPIO chip structure as argument. During the execution * of the callback function the chip is protected from being freed. TODO: This * actually has yet to be implemented. * * If the function returns non-NULL, the returned reference must be freed by * the caller using gpio_device_put(). 
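 *
 * A minimal usage sketch (the match-by-parent callback shown here is
 * hypothetical, not an existing helper):
 *
 *	static int match_parent(struct gpio_chip *gc, void *data)
 *	{
 *		return gc->parent == data;
 *	}
 *
 *	gdev = gpio_device_find(dev, match_parent);
 *	if (gdev) {
 *		dev_info(dev, "GPIO chip found: %s\n", dev_name(&gdev->dev));
 *		gpio_device_put(gdev);
 *	}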
*/ struct gpio_device *gpio_device_find(void *data, int (*match)(struct gpio_chip *gc, void *data)) { struct gpio_device *gdev; /* * Not yet but in the future the spinlock below will become a mutex. * Annotate this function before anyone tries to use it in interrupt * context like it happened with gpiochip_find(). */ might_sleep(); guard(spinlock_irqsave)(&gpio_lock); list_for_each_entry(gdev, &gpio_devices, list) { if (gdev->chip && match(gdev->chip, data)) return gpio_device_get(gdev); } return NULL; } EXPORT_SYMBOL_GPL(gpio_device_find); static int gpio_chip_match_by_label(struct gpio_chip *gc, void *label) { return gc->label && !strcmp(gc->label, label); } /** * gpio_device_find_by_label() - wrapper around gpio_device_find() finding the * GPIO device by its backing chip's label * @label: Label to lookup * * Returns: * Reference to the GPIO device or NULL. Reference must be released with * gpio_device_put(). */ struct gpio_device *gpio_device_find_by_label(const char *label) { return gpio_device_find((void *)label, gpio_chip_match_by_label); } EXPORT_SYMBOL_GPL(gpio_device_find_by_label); static int gpio_chip_match_by_fwnode(struct gpio_chip *gc, void *fwnode) { return device_match_fwnode(&gc->gpiodev->dev, fwnode); } /** * gpio_device_find_by_fwnode() - wrapper around gpio_device_find() finding * the GPIO device by its fwnode * @fwnode: Firmware node to lookup * * Returns: * Reference to the GPIO device or NULL. Reference must be released with * gpio_device_put(). */ struct gpio_device *gpio_device_find_by_fwnode(const struct fwnode_handle *fwnode) { return gpio_device_find((void *)fwnode, gpio_chip_match_by_fwnode); } EXPORT_SYMBOL_GPL(gpio_device_find_by_fwnode); /** * gpio_device_get() - Increase the reference count of this GPIO device * @gdev: GPIO device to increase the refcount for * * Returns: * Pointer to @gdev. */ struct gpio_device *gpio_device_get(struct gpio_device *gdev) { return to_gpio_device(get_device(&gdev->dev)); } EXPORT_SYMBOL_GPL(gpio_device_get); /** * gpio_device_put() - Decrease the reference count of this GPIO device and * possibly free all resources associated with it. * @gdev: GPIO device to decrease the reference count for */ void gpio_device_put(struct gpio_device *gdev) { put_device(&gdev->dev); } EXPORT_SYMBOL_GPL(gpio_device_put); /** * gpio_device_to_device() - Retrieve the address of the underlying struct * device. * @gdev: GPIO device for which to return the address. * * This does not increase the reference count of the GPIO device nor the * underlying struct device. * * Returns: * Address of struct device backing this GPIO device. */ struct device *gpio_device_to_device(struct gpio_device *gdev) { return &gdev->dev; } EXPORT_SYMBOL_GPL(gpio_device_to_device); #ifdef CONFIG_GPIOLIB_IRQCHIP /* * The following is irqchip helper code for gpiochips. 
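 *
 * A rough sketch of how a driver typically wires this up before calling
 * gpiochip_add_data(); foo_irq_chip, foo_irq_handler and parent_irq are
 * placeholders for driver-specific pieces:
 *
 *	struct gpio_irq_chip *girq = &gc->irq;
 *
 *	gpio_irq_chip_set_chip(girq, &foo_irq_chip);
 *	girq->handler = handle_bad_irq;
 *	girq->default_type = IRQ_TYPE_NONE;
 *	girq->num_parents = 1;
 *	girq->parents = devm_kcalloc(dev, 1, sizeof(*girq->parents),
 *				     GFP_KERNEL);
 *	if (!girq->parents)
 *		return -ENOMEM;
 *	girq->parents[0] = parent_irq;
 *	girq->parent_handler = foo_irq_handler;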
*/ static int gpiochip_irqchip_init_hw(struct gpio_chip *gc) { struct gpio_irq_chip *girq = &gc->irq; if (!girq->init_hw) return 0; return girq->init_hw(gc); } static int gpiochip_irqchip_init_valid_mask(struct gpio_chip *gc) { struct gpio_irq_chip *girq = &gc->irq; if (!girq->init_valid_mask) return 0; girq->valid_mask = gpiochip_allocate_mask(gc); if (!girq->valid_mask) return -ENOMEM; girq->init_valid_mask(gc, girq->valid_mask, gc->ngpio); return 0; } static void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gc) { gpiochip_free_mask(&gc->irq.valid_mask); } bool gpiochip_irqchip_irq_valid(const struct gpio_chip *gc, unsigned int offset) { if (!gpiochip_line_is_valid(gc, offset)) return false; /* No mask means all valid */ if (likely(!gc->irq.valid_mask)) return true; return test_bit(offset, gc->irq.valid_mask); } EXPORT_SYMBOL_GPL(gpiochip_irqchip_irq_valid); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /** * gpiochip_set_hierarchical_irqchip() - connects a hierarchical irqchip * to a gpiochip * @gc: the gpiochip to set the irqchip hierarchical handler to * @irqchip: the irqchip to handle this level of the hierarchy, the interrupt * will then percolate up to the parent */ static void gpiochip_set_hierarchical_irqchip(struct gpio_chip *gc, struct irq_chip *irqchip) { /* DT will deal with mapping each IRQ as we go along */ if (is_of_node(gc->irq.fwnode)) return; /* * This is for legacy and boardfile "irqchip" fwnodes: allocate * irqs upfront instead of dynamically since we don't have the * dynamic type of allocation that hardware description languages * provide. Once all GPIO drivers using board files are gone from * the kernel we can delete this code, but for a transitional period * it is necessary to keep this around. */ if (is_fwnode_irqchip(gc->irq.fwnode)) { int i; int ret; for (i = 0; i < gc->ngpio; i++) { struct irq_fwspec fwspec; unsigned int parent_hwirq; unsigned int parent_type; struct gpio_irq_chip *girq = &gc->irq; /* * We call the child to parent translation function * only to check if the child IRQ is valid or not. * Just pick the rising edge type here as that is what * we likely need to support. 
*/ ret = girq->child_to_parent_hwirq(gc, i, IRQ_TYPE_EDGE_RISING, &parent_hwirq, &parent_type); if (ret) { chip_err(gc, "skip set-up on hwirq %d\n", i); continue; } fwspec.fwnode = gc->irq.fwnode; /* This is the hwirq for the GPIO line side of things */ fwspec.param[0] = girq->child_offset_to_irq(gc, i); /* Just pick something */ fwspec.param[1] = IRQ_TYPE_EDGE_RISING; fwspec.param_count = 2; ret = irq_domain_alloc_irqs(gc->irq.domain, 1, NUMA_NO_NODE, &fwspec); if (ret < 0) { chip_err(gc, "can not allocate irq for GPIO line %d parent hwirq %d in hierarchy domain: %d\n", i, parent_hwirq, ret); } } } chip_err(gc, "%s unknown fwnode type proceed anyway\n", __func__); return; } static int gpiochip_hierarchy_irq_domain_translate(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *hwirq, unsigned int *type) { /* We support standard DT translation */ if (is_of_node(fwspec->fwnode) && fwspec->param_count == 2) { return irq_domain_translate_twocell(d, fwspec, hwirq, type); } /* This is for board files and others not using DT */ if (is_fwnode_irqchip(fwspec->fwnode)) { int ret; ret = irq_domain_translate_twocell(d, fwspec, hwirq, type); if (ret) return ret; WARN_ON(*type == IRQ_TYPE_NONE); return 0; } return -EINVAL; } static int gpiochip_hierarchy_irq_domain_alloc(struct irq_domain *d, unsigned int irq, unsigned int nr_irqs, void *data) { struct gpio_chip *gc = d->host_data; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; struct irq_fwspec *fwspec = data; union gpio_irq_fwspec gpio_parent_fwspec = {}; unsigned int parent_hwirq; unsigned int parent_type; struct gpio_irq_chip *girq = &gc->irq; int ret; /* * The nr_irqs parameter is always one except for PCI multi-MSI * so this should not happen. */ WARN_ON(nr_irqs != 1); ret = gc->irq.child_irq_domain_ops.translate(d, fwspec, &hwirq, &type); if (ret) return ret; chip_dbg(gc, "allocate IRQ %d, hwirq %lu\n", irq, hwirq); ret = girq->child_to_parent_hwirq(gc, hwirq, type, &parent_hwirq, &parent_type); if (ret) { chip_err(gc, "can't look up hwirq %lu\n", hwirq); return ret; } chip_dbg(gc, "found parent hwirq %u\n", parent_hwirq); /* * We set handle_bad_irq because the .set_type() should * always be invoked and set the right type of handler. */ irq_domain_set_info(d, irq, hwirq, gc->irq.chip, gc, girq->handler, NULL, NULL); irq_set_probe(irq); /* This parent only handles asserted level IRQs */ ret = girq->populate_parent_alloc_arg(gc, &gpio_parent_fwspec, parent_hwirq, parent_type); if (ret) return ret; chip_dbg(gc, "alloc_irqs_parent for %d parent hwirq %d\n", irq, parent_hwirq); irq_set_lockdep_class(irq, gc->irq.lock_key, gc->irq.request_key); ret = irq_domain_alloc_irqs_parent(d, irq, 1, &gpio_parent_fwspec); /* * If the parent irqdomain is msi, the interrupts have already * been allocated, so the EEXIST is good. 
*/ if (irq_domain_is_msi(d->parent) && (ret == -EEXIST)) ret = 0; if (ret) chip_err(gc, "failed to allocate parent hwirq %d for hwirq %lu\n", parent_hwirq, hwirq); return ret; } static unsigned int gpiochip_child_offset_to_irq_noop(struct gpio_chip *gc, unsigned int offset) { return offset; } static void gpiochip_hierarchy_setup_domain_ops(struct irq_domain_ops *ops) { ops->activate = gpiochip_irq_domain_activate; ops->deactivate = gpiochip_irq_domain_deactivate; ops->alloc = gpiochip_hierarchy_irq_domain_alloc; /* * We only allow overriding the translate() and free() functions for * hierarchical chips, and this should only be done if the user * really need something other than 1:1 translation for translate() * callback and free if user wants to free up any resources which * were allocated during callbacks, for example populate_parent_alloc_arg. */ if (!ops->translate) ops->translate = gpiochip_hierarchy_irq_domain_translate; if (!ops->free) ops->free = irq_domain_free_irqs_common; } static struct irq_domain *gpiochip_hierarchy_create_domain(struct gpio_chip *gc) { struct irq_domain *domain; if (!gc->irq.child_to_parent_hwirq || !gc->irq.fwnode) { chip_err(gc, "missing irqdomain vital data\n"); return ERR_PTR(-EINVAL); } if (!gc->irq.child_offset_to_irq) gc->irq.child_offset_to_irq = gpiochip_child_offset_to_irq_noop; if (!gc->irq.populate_parent_alloc_arg) gc->irq.populate_parent_alloc_arg = gpiochip_populate_parent_fwspec_twocell; gpiochip_hierarchy_setup_domain_ops(&gc->irq.child_irq_domain_ops); domain = irq_domain_create_hierarchy( gc->irq.parent_domain, 0, gc->ngpio, gc->irq.fwnode, &gc->irq.child_irq_domain_ops, gc); if (!domain) return ERR_PTR(-ENOMEM); gpiochip_set_hierarchical_irqchip(gc, gc->irq.chip); return domain; } static bool gpiochip_hierarchy_is_hierarchical(struct gpio_chip *gc) { return !!gc->irq.parent_domain; } int gpiochip_populate_parent_fwspec_twocell(struct gpio_chip *gc, union gpio_irq_fwspec *gfwspec, unsigned int parent_hwirq, unsigned int parent_type) { struct irq_fwspec *fwspec = &gfwspec->fwspec; fwspec->fwnode = gc->irq.parent_domain->fwnode; fwspec->param_count = 2; fwspec->param[0] = parent_hwirq; fwspec->param[1] = parent_type; return 0; } EXPORT_SYMBOL_GPL(gpiochip_populate_parent_fwspec_twocell); int gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc, union gpio_irq_fwspec *gfwspec, unsigned int parent_hwirq, unsigned int parent_type) { struct irq_fwspec *fwspec = &gfwspec->fwspec; fwspec->fwnode = gc->irq.parent_domain->fwnode; fwspec->param_count = 4; fwspec->param[0] = 0; fwspec->param[1] = parent_hwirq; fwspec->param[2] = 0; fwspec->param[3] = parent_type; return 0; } EXPORT_SYMBOL_GPL(gpiochip_populate_parent_fwspec_fourcell); #else static struct irq_domain *gpiochip_hierarchy_create_domain(struct gpio_chip *gc) { return ERR_PTR(-EINVAL); } static bool gpiochip_hierarchy_is_hierarchical(struct gpio_chip *gc) { return false; } #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ /** * gpiochip_irq_map() - maps an IRQ into a GPIO irqchip * @d: the irqdomain used by this irqchip * @irq: the global irq number used by this GPIO irqchip irq * @hwirq: the local IRQ/GPIO line offset on this gpiochip * * This function will set up the mapping for a certain IRQ line on a * gpiochip by assigning the gpiochip as chip data, and using the irqchip * stored inside the gpiochip. 
*/ int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hwirq) { struct gpio_chip *gc = d->host_data; int ret = 0; if (!gpiochip_irqchip_irq_valid(gc, hwirq)) return -ENXIO; irq_set_chip_data(irq, gc); /* * This lock class tells lockdep that GPIO irqs are in a different * category than their parents, so it won't report false recursion. */ irq_set_lockdep_class(irq, gc->irq.lock_key, gc->irq.request_key); irq_set_chip_and_handler(irq, gc->irq.chip, gc->irq.handler); /* Chips that use nested thread handlers have them marked */ if (gc->irq.threaded) irq_set_nested_thread(irq, 1); irq_set_noprobe(irq); if (gc->irq.num_parents == 1) ret = irq_set_parent(irq, gc->irq.parents[0]); else if (gc->irq.map) ret = irq_set_parent(irq, gc->irq.map[hwirq]); if (ret < 0) return ret; /* * No set-up of the hardware will happen if IRQ_TYPE_NONE * is passed as default type. */ if (gc->irq.default_type != IRQ_TYPE_NONE) irq_set_irq_type(irq, gc->irq.default_type); return 0; } EXPORT_SYMBOL_GPL(gpiochip_irq_map); void gpiochip_irq_unmap(struct irq_domain *d, unsigned int irq) { struct gpio_chip *gc = d->host_data; if (gc->irq.threaded) irq_set_nested_thread(irq, 0); irq_set_chip_and_handler(irq, NULL, NULL); irq_set_chip_data(irq, NULL); } EXPORT_SYMBOL_GPL(gpiochip_irq_unmap); static const struct irq_domain_ops gpiochip_domain_ops = { .map = gpiochip_irq_map, .unmap = gpiochip_irq_unmap, /* Virtually all GPIO irqchips are twocell:ed */ .xlate = irq_domain_xlate_twocell, }; static struct irq_domain *gpiochip_simple_create_domain(struct gpio_chip *gc) { struct fwnode_handle *fwnode = dev_fwnode(&gc->gpiodev->dev); struct irq_domain *domain; domain = irq_domain_create_simple(fwnode, gc->ngpio, gc->irq.first, &gpiochip_domain_ops, gc); if (!domain) return ERR_PTR(-EINVAL); return domain; } /* * TODO: move these activate/deactivate in under the hierarchicial * irqchip implementation as static once SPMI and SSBI (all external * users) are phased over. */ /** * gpiochip_irq_domain_activate() - Lock a GPIO to be used as an IRQ * @domain: The IRQ domain used by this IRQ chip * @data: Outermost irq_data associated with the IRQ * @reserve: If set, only reserve an interrupt vector instead of assigning one * * This function is a wrapper that calls gpiochip_lock_as_irq() and is to be * used as the activate function for the &struct irq_domain_ops. The host_data * for the IRQ domain must be the &struct gpio_chip. */ int gpiochip_irq_domain_activate(struct irq_domain *domain, struct irq_data *data, bool reserve) { struct gpio_chip *gc = domain->host_data; unsigned int hwirq = irqd_to_hwirq(data); return gpiochip_lock_as_irq(gc, hwirq); } EXPORT_SYMBOL_GPL(gpiochip_irq_domain_activate); /** * gpiochip_irq_domain_deactivate() - Unlock a GPIO used as an IRQ * @domain: The IRQ domain used by this IRQ chip * @data: Outermost irq_data associated with the IRQ * * This function is a wrapper that will call gpiochip_unlock_as_irq() and is to * be used as the deactivate function for the &struct irq_domain_ops. The * host_data for the IRQ domain must be the &struct gpio_chip. 
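 *
 * Sketch of how an external user (such as the SPMI/SSBI drivers noted in
 * the TODO above) plugs these wrappers into its own irq_domain_ops; the
 * alloc and translate callbacks are hypothetical driver code:
 *
 *	static const struct irq_domain_ops foo_irq_domain_ops = {
 *		.activate = gpiochip_irq_domain_activate,
 *		.deactivate = gpiochip_irq_domain_deactivate,
 *		.alloc = foo_irq_domain_alloc,
 *		.translate = foo_irq_domain_translate,
 *		.free = irq_domain_free_irqs_common,
 *	};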
*/ void gpiochip_irq_domain_deactivate(struct irq_domain *domain, struct irq_data *data) { struct gpio_chip *gc = domain->host_data; unsigned int hwirq = irqd_to_hwirq(data); return gpiochip_unlock_as_irq(gc, hwirq); } EXPORT_SYMBOL_GPL(gpiochip_irq_domain_deactivate); static int gpiochip_to_irq(struct gpio_chip *gc, unsigned int offset) { struct irq_domain *domain = gc->irq.domain; #ifdef CONFIG_GPIOLIB_IRQCHIP /* * Avoid race condition with other code, which tries to lookup * an IRQ before the irqchip has been properly registered, * i.e. while gpiochip is still being brought up. */ if (!gc->irq.initialized) return -EPROBE_DEFER; #endif if (!gpiochip_irqchip_irq_valid(gc, offset)) return -ENXIO; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (irq_domain_is_hierarchy(domain)) { struct irq_fwspec spec; spec.fwnode = domain->fwnode; spec.param_count = 2; spec.param[0] = gc->irq.child_offset_to_irq(gc, offset); spec.param[1] = IRQ_TYPE_NONE; return irq_create_fwspec_mapping(&spec); } #endif return irq_create_mapping(domain, offset); } int gpiochip_irq_reqres(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); unsigned int hwirq = irqd_to_hwirq(d); return gpiochip_reqres_irq(gc, hwirq); } EXPORT_SYMBOL(gpiochip_irq_reqres); void gpiochip_irq_relres(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); unsigned int hwirq = irqd_to_hwirq(d); gpiochip_relres_irq(gc, hwirq); } EXPORT_SYMBOL(gpiochip_irq_relres); static void gpiochip_irq_mask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); unsigned int hwirq = irqd_to_hwirq(d); if (gc->irq.irq_mask) gc->irq.irq_mask(d); gpiochip_disable_irq(gc, hwirq); } static void gpiochip_irq_unmask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); unsigned int hwirq = irqd_to_hwirq(d); gpiochip_enable_irq(gc, hwirq); if (gc->irq.irq_unmask) gc->irq.irq_unmask(d); } static void gpiochip_irq_enable(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); unsigned int hwirq = irqd_to_hwirq(d); gpiochip_enable_irq(gc, hwirq); gc->irq.irq_enable(d); } static void gpiochip_irq_disable(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); unsigned int hwirq = irqd_to_hwirq(d); gc->irq.irq_disable(d); gpiochip_disable_irq(gc, hwirq); } static void gpiochip_set_irq_hooks(struct gpio_chip *gc) { struct irq_chip *irqchip = gc->irq.chip; if (irqchip->flags & IRQCHIP_IMMUTABLE) return; chip_warn(gc, "not an immutable chip, please consider fixing it!\n"); if (!irqchip->irq_request_resources && !irqchip->irq_release_resources) { irqchip->irq_request_resources = gpiochip_irq_reqres; irqchip->irq_release_resources = gpiochip_irq_relres; } if (WARN_ON(gc->irq.irq_enable)) return; /* Check if the irqchip already has this hook... */ if (irqchip->irq_enable == gpiochip_irq_enable || irqchip->irq_mask == gpiochip_irq_mask) { /* * ...and if so, give a gentle warning that this is bad * practice. 
*/ chip_info(gc, "detected irqchip that is shared with multiple gpiochips: please fix the driver.\n"); return; } if (irqchip->irq_disable) { gc->irq.irq_disable = irqchip->irq_disable; irqchip->irq_disable = gpiochip_irq_disable; } else { gc->irq.irq_mask = irqchip->irq_mask; irqchip->irq_mask = gpiochip_irq_mask; } if (irqchip->irq_enable) { gc->irq.irq_enable = irqchip->irq_enable; irqchip->irq_enable = gpiochip_irq_enable; } else { gc->irq.irq_unmask = irqchip->irq_unmask; irqchip->irq_unmask = gpiochip_irq_unmask; } } static int gpiochip_irqchip_add_allocated_domain(struct gpio_chip *gc, struct irq_domain *domain, bool allocated_externally) { if (!domain) return -EINVAL; if (gc->to_irq) chip_warn(gc, "to_irq is redefined in %s and you shouldn't rely on it\n", __func__); gc->to_irq = gpiochip_to_irq; gc->irq.domain = domain; gc->irq.domain_is_allocated_externally = allocated_externally; /* * Using barrier() here to prevent compiler from reordering * gc->irq.initialized before adding irqdomain. */ barrier(); gc->irq.initialized = true; return 0; } /** * gpiochip_add_irqchip() - adds an IRQ chip to a GPIO chip * @gc: the GPIO chip to add the IRQ chip to * @lock_key: lockdep class for IRQ lock * @request_key: lockdep class for IRQ request */ static int gpiochip_add_irqchip(struct gpio_chip *gc, struct lock_class_key *lock_key, struct lock_class_key *request_key) { struct fwnode_handle *fwnode = dev_fwnode(&gc->gpiodev->dev); struct irq_chip *irqchip = gc->irq.chip; struct irq_domain *domain; unsigned int type; unsigned int i; int ret; if (!irqchip) return 0; if (gc->irq.parent_handler && gc->can_sleep) { chip_err(gc, "you cannot have chained interrupts on a chip that may sleep\n"); return -EINVAL; } type = gc->irq.default_type; /* * Specifying a default trigger is a terrible idea if DT or ACPI is * used to configure the interrupts, as you may end up with * conflicting triggers. Tell the user, and reset to NONE. */ if (WARN(fwnode && type != IRQ_TYPE_NONE, "%pfw: Ignoring %u default trigger\n", fwnode, type)) type = IRQ_TYPE_NONE; gc->irq.default_type = type; gc->irq.lock_key = lock_key; gc->irq.request_key = request_key; /* If a parent irqdomain is provided, let's build a hierarchy */ if (gpiochip_hierarchy_is_hierarchical(gc)) { domain = gpiochip_hierarchy_create_domain(gc); } else { domain = gpiochip_simple_create_domain(gc); } if (IS_ERR(domain)) return PTR_ERR(domain); if (gc->irq.parent_handler) { for (i = 0; i < gc->irq.num_parents; i++) { void *data; if (gc->irq.per_parent_data) data = gc->irq.parent_handler_data_array[i]; else data = gc->irq.parent_handler_data ?: gc; /* * The parent IRQ chip is already using the chip_data * for this IRQ chip, so our callbacks simply use the * handler_data. 
*/ irq_set_chained_handler_and_data(gc->irq.parents[i], gc->irq.parent_handler, data); } } gpiochip_set_irq_hooks(gc); ret = gpiochip_irqchip_add_allocated_domain(gc, domain, false); if (ret) return ret; acpi_gpiochip_request_interrupts(gc); return 0; } /** * gpiochip_irqchip_remove() - removes an irqchip added to a gpiochip * @gc: the gpiochip to remove the irqchip from * * This is called only from gpiochip_remove() */ static void gpiochip_irqchip_remove(struct gpio_chip *gc) { struct irq_chip *irqchip = gc->irq.chip; unsigned int offset; acpi_gpiochip_free_interrupts(gc); if (irqchip && gc->irq.parent_handler) { struct gpio_irq_chip *irq = &gc->irq; unsigned int i; for (i = 0; i < irq->num_parents; i++) irq_set_chained_handler_and_data(irq->parents[i], NULL, NULL); } /* Remove all IRQ mappings and delete the domain */ if (!gc->irq.domain_is_allocated_externally && gc->irq.domain) { unsigned int irq; for (offset = 0; offset < gc->ngpio; offset++) { if (!gpiochip_irqchip_irq_valid(gc, offset)) continue; irq = irq_find_mapping(gc->irq.domain, offset); irq_dispose_mapping(irq); } irq_domain_remove(gc->irq.domain); } if (irqchip && !(irqchip->flags & IRQCHIP_IMMUTABLE)) { if (irqchip->irq_request_resources == gpiochip_irq_reqres) { irqchip->irq_request_resources = NULL; irqchip->irq_release_resources = NULL; } if (irqchip->irq_enable == gpiochip_irq_enable) { irqchip->irq_enable = gc->irq.irq_enable; irqchip->irq_disable = gc->irq.irq_disable; } } gc->irq.irq_enable = NULL; gc->irq.irq_disable = NULL; gc->irq.chip = NULL; gpiochip_irqchip_free_valid_mask(gc); } /** * gpiochip_irqchip_add_domain() - adds an irqdomain to a gpiochip * @gc: the gpiochip to add the irqchip to * @domain: the irqdomain to add to the gpiochip * * This function adds an IRQ domain to the gpiochip. 
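 *
 * Sketch of typical use, where the caller creates (and later disposes of)
 * the domain itself instead of letting gpiolib do it; foo_domain_ops stands
 * in for the driver's own ops with the gpio_chip as host_data:
 *
 *	domain = irq_domain_create_simple(dev_fwnode(dev), gc->ngpio, 0,
 *					  &foo_domain_ops, gc);
 *	if (!domain)
 *		return -ENOMEM;
 *
 *	ret = gpiochip_irqchip_add_domain(gc, domain);
 *	if (ret)
 *		return ret;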
*/ int gpiochip_irqchip_add_domain(struct gpio_chip *gc, struct irq_domain *domain) { return gpiochip_irqchip_add_allocated_domain(gc, domain, true); } EXPORT_SYMBOL_GPL(gpiochip_irqchip_add_domain); #else /* CONFIG_GPIOLIB_IRQCHIP */ static inline int gpiochip_add_irqchip(struct gpio_chip *gc, struct lock_class_key *lock_key, struct lock_class_key *request_key) { return 0; } static void gpiochip_irqchip_remove(struct gpio_chip *gc) {} static inline int gpiochip_irqchip_init_hw(struct gpio_chip *gc) { return 0; } static inline int gpiochip_irqchip_init_valid_mask(struct gpio_chip *gc) { return 0; } static inline void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gc) { } #endif /* CONFIG_GPIOLIB_IRQCHIP */ /** * gpiochip_generic_request() - request the gpio function for a pin * @gc: the gpiochip owning the GPIO * @offset: the offset of the GPIO to request for GPIO function */ int gpiochip_generic_request(struct gpio_chip *gc, unsigned int offset) { #ifdef CONFIG_PINCTRL if (list_empty(&gc->gpiodev->pin_ranges)) return 0; #endif return pinctrl_gpio_request(gc, offset); } EXPORT_SYMBOL_GPL(gpiochip_generic_request); /** * gpiochip_generic_free() - free the gpio function from a pin * @gc: the gpiochip to request the gpio function for * @offset: the offset of the GPIO to free from GPIO function */ void gpiochip_generic_free(struct gpio_chip *gc, unsigned int offset) { #ifdef CONFIG_PINCTRL if (list_empty(&gc->gpiodev->pin_ranges)) return; #endif pinctrl_gpio_free(gc, offset); } EXPORT_SYMBOL_GPL(gpiochip_generic_free); /** * gpiochip_generic_config() - apply configuration for a pin * @gc: the gpiochip owning the GPIO * @offset: the offset of the GPIO to apply the configuration * @config: the configuration to be applied */ int gpiochip_generic_config(struct gpio_chip *gc, unsigned int offset, unsigned long config) { return pinctrl_gpio_set_config(gc, offset, config); } EXPORT_SYMBOL_GPL(gpiochip_generic_config); #ifdef CONFIG_PINCTRL /** * gpiochip_add_pingroup_range() - add a range for GPIO <-> pin mapping * @gc: the gpiochip to add the range for * @pctldev: the pin controller to map to * @gpio_offset: the start offset in the current gpio_chip number space * @pin_group: name of the pin group inside the pin controller * * Calling this function directly from a DeviceTree-supported * pinctrl driver is DEPRECATED. Please see Section 2.1 of * Documentation/devicetree/bindings/gpio/gpio.txt on how to * bind pinctrl and gpio drivers via the "gpio-ranges" property. 
*/ int gpiochip_add_pingroup_range(struct gpio_chip *gc, struct pinctrl_dev *pctldev, unsigned int gpio_offset, const char *pin_group) { struct gpio_pin_range *pin_range; struct gpio_device *gdev = gc->gpiodev; int ret; pin_range = kzalloc(sizeof(*pin_range), GFP_KERNEL); if (!pin_range) { chip_err(gc, "failed to allocate pin ranges\n"); return -ENOMEM; } /* Use local offset as range ID */ pin_range->range.id = gpio_offset; pin_range->range.gc = gc; pin_range->range.name = gc->label; pin_range->range.base = gdev->base + gpio_offset; pin_range->pctldev = pctldev; ret = pinctrl_get_group_pins(pctldev, pin_group, &pin_range->range.pins, &pin_range->range.npins); if (ret < 0) { kfree(pin_range); return ret; } pinctrl_add_gpio_range(pctldev, &pin_range->range); chip_dbg(gc, "created GPIO range %d->%d ==> %s PINGRP %s\n", gpio_offset, gpio_offset + pin_range->range.npins - 1, pinctrl_dev_get_devname(pctldev), pin_group); list_add_tail(&pin_range->node, &gdev->pin_ranges); return 0; } EXPORT_SYMBOL_GPL(gpiochip_add_pingroup_range); /** * gpiochip_add_pin_range() - add a range for GPIO <-> pin mapping * @gc: the gpiochip to add the range for * @pinctl_name: the dev_name() of the pin controller to map to * @gpio_offset: the start offset in the current gpio_chip number space * @pin_offset: the start offset in the pin controller number space * @npins: the number of pins from the offset of each pin space (GPIO and * pin controller) to accumulate in this range * * Returns: * 0 on success, or a negative error-code on failure. * * Calling this function directly from a DeviceTree-supported * pinctrl driver is DEPRECATED. Please see Section 2.1 of * Documentation/devicetree/bindings/gpio/gpio.txt on how to * bind pinctrl and gpio drivers via the "gpio-ranges" property. */ int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, unsigned int gpio_offset, unsigned int pin_offset, unsigned int npins) { struct gpio_pin_range *pin_range; struct gpio_device *gdev = gc->gpiodev; int ret; pin_range = kzalloc(sizeof(*pin_range), GFP_KERNEL); if (!pin_range) { chip_err(gc, "failed to allocate pin ranges\n"); return -ENOMEM; } /* Use local offset as range ID */ pin_range->range.id = gpio_offset; pin_range->range.gc = gc; pin_range->range.name = gc->label; pin_range->range.base = gdev->base + gpio_offset; pin_range->range.pin_base = pin_offset; pin_range->range.npins = npins; pin_range->pctldev = pinctrl_find_and_add_gpio_range(pinctl_name, &pin_range->range); if (IS_ERR(pin_range->pctldev)) { ret = PTR_ERR(pin_range->pctldev); chip_err(gc, "could not create pin range\n"); kfree(pin_range); return ret; } chip_dbg(gc, "created GPIO range %d->%d ==> %s PIN %d->%d\n", gpio_offset, gpio_offset + npins - 1, pinctl_name, pin_offset, pin_offset + npins - 1); list_add_tail(&pin_range->node, &gdev->pin_ranges); return 0; } EXPORT_SYMBOL_GPL(gpiochip_add_pin_range); /** * gpiochip_remove_pin_ranges() - remove all the GPIO <-> pin mappings * @gc: the chip to remove all the mappings for */ void gpiochip_remove_pin_ranges(struct gpio_chip *gc) { struct gpio_pin_range *pin_range, *tmp; struct gpio_device *gdev = gc->gpiodev; list_for_each_entry_safe(pin_range, tmp, &gdev->pin_ranges, node) { list_del(&pin_range->node); pinctrl_remove_gpio_range(pin_range->pctldev, &pin_range->range); kfree(pin_range); } } EXPORT_SYMBOL_GPL(gpiochip_remove_pin_ranges); #endif /* CONFIG_PINCTRL */ /* These "optional" allocation calls help prevent drivers from stomping * on each other, and help provide better diagnostics in debugfs. 
* They're called even less than the "set direction" calls. */ static int gpiod_request_commit(struct gpio_desc *desc, const char *label) { struct gpio_chip *gc = desc->gdev->chip; unsigned long flags; unsigned int offset; int ret; if (label) { label = kstrdup_const(label, GFP_KERNEL); if (!label) return -ENOMEM; } spin_lock_irqsave(&gpio_lock, flags); /* NOTE: gpio_request() can be called in early boot, * before IRQs are enabled, for non-sleeping (SOC) GPIOs. */ if (test_and_set_bit(FLAG_REQUESTED, &desc->flags) == 0) { desc_set_label(desc, label ? : "?"); } else { ret = -EBUSY; goto out_free_unlock; } if (gc->request) { /* gc->request may sleep */ spin_unlock_irqrestore(&gpio_lock, flags); offset = gpio_chip_hwgpio(desc); if (gpiochip_line_is_valid(gc, offset)) ret = gc->request(gc, offset); else ret = -EINVAL; spin_lock_irqsave(&gpio_lock, flags); if (ret) { desc_set_label(desc, NULL); clear_bit(FLAG_REQUESTED, &desc->flags); goto out_free_unlock; } } if (gc->get_direction) { /* gc->get_direction may sleep */ spin_unlock_irqrestore(&gpio_lock, flags); gpiod_get_direction(desc); spin_lock_irqsave(&gpio_lock, flags); } spin_unlock_irqrestore(&gpio_lock, flags); return 0; out_free_unlock: spin_unlock_irqrestore(&gpio_lock, flags); kfree_const(label); return ret; } /* * This descriptor validation needs to be inserted verbatim into each * function taking a descriptor, so we need to use a preprocessor * macro to avoid endless duplication. If the desc is NULL it is an * optional GPIO and calls should just bail out. */ static int validate_desc(const struct gpio_desc *desc, const char *func) { if (!desc) return 0; if (IS_ERR(desc)) { pr_warn("%s: invalid GPIO (errorpointer)\n", func); return PTR_ERR(desc); } if (!desc->gdev) { pr_warn("%s: invalid GPIO (no device)\n", func); return -EINVAL; } if (!desc->gdev->chip) { dev_warn(&desc->gdev->dev, "%s: backing chip is gone\n", func); return 0; } return 1; } #define VALIDATE_DESC(desc) do { \ int __valid = validate_desc(desc, __func__); \ if (__valid <= 0) \ return __valid; \ } while (0) #define VALIDATE_DESC_VOID(desc) do { \ int __valid = validate_desc(desc, __func__); \ if (__valid <= 0) \ return; \ } while (0) int gpiod_request(struct gpio_desc *desc, const char *label) { int ret = -EPROBE_DEFER; VALIDATE_DESC(desc); if (try_module_get(desc->gdev->owner)) { ret = gpiod_request_commit(desc, label); if (ret) module_put(desc->gdev->owner); else gpio_device_get(desc->gdev); } if (ret) gpiod_dbg(desc, "%s: status %d\n", __func__, ret); return ret; } static bool gpiod_free_commit(struct gpio_desc *desc) { struct gpio_chip *gc; unsigned long flags; bool ret = false; might_sleep(); spin_lock_irqsave(&gpio_lock, flags); gc = desc->gdev->chip; if (gc && test_bit(FLAG_REQUESTED, &desc->flags)) { if (gc->free) { spin_unlock_irqrestore(&gpio_lock, flags); might_sleep_if(gc->can_sleep); gc->free(gc, gpio_chip_hwgpio(desc)); spin_lock_irqsave(&gpio_lock, flags); } kfree_const(desc->label); desc_set_label(desc, NULL); clear_bit(FLAG_ACTIVE_LOW, &desc->flags); clear_bit(FLAG_REQUESTED, &desc->flags); clear_bit(FLAG_OPEN_DRAIN, &desc->flags); clear_bit(FLAG_OPEN_SOURCE, &desc->flags); clear_bit(FLAG_PULL_UP, &desc->flags); clear_bit(FLAG_PULL_DOWN, &desc->flags); clear_bit(FLAG_BIAS_DISABLE, &desc->flags); clear_bit(FLAG_EDGE_RISING, &desc->flags); clear_bit(FLAG_EDGE_FALLING, &desc->flags); clear_bit(FLAG_IS_HOGGED, &desc->flags); #ifdef CONFIG_OF_DYNAMIC desc->hog = NULL; #endif ret = true; } spin_unlock_irqrestore(&gpio_lock, flags); 
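	/* Let line state listeners (e.g. the chardev) know the line was released. */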
gpiod_line_state_notify(desc, GPIOLINE_CHANGED_RELEASED); return ret; } void gpiod_free(struct gpio_desc *desc) { /* * We must not use VALIDATE_DESC_VOID() as the underlying gdev->chip * may already be NULL but we still want to put the references. */ if (!desc) return; if (!gpiod_free_commit(desc)) WARN_ON(1); module_put(desc->gdev->owner); gpio_device_put(desc->gdev); } /** * gpiochip_dup_line_label - Get a copy of the consumer label. * @gc: GPIO chip controlling this line. * @offset: Hardware offset of the line. * * Returns: * Pointer to a copy of the consumer label if the line is requested or NULL * if it's not. If a valid pointer was returned, it must be freed using * kfree(). In case of a memory allocation error, the function returns %ENOMEM. * * Must not be called from atomic context. */ char *gpiochip_dup_line_label(struct gpio_chip *gc, unsigned int offset) { struct gpio_desc *desc; char *label; desc = gpiochip_get_desc(gc, offset); if (IS_ERR(desc)) return NULL; guard(spinlock_irqsave)(&gpio_lock); if (!test_bit(FLAG_REQUESTED, &desc->flags)) return NULL; /* * FIXME: Once we mark gpiod_direction_input/output() and * gpiod_get_direction() with might_sleep(), we'll be able to protect * the GPIO descriptors with mutex (while value setting operations will * become lockless). * * Until this happens, this allocation needs to be atomic. */ label = kstrdup(desc->label, GFP_ATOMIC); if (!label) return ERR_PTR(-ENOMEM); return label; } EXPORT_SYMBOL_GPL(gpiochip_dup_line_label); /** * gpiochip_request_own_desc - Allow GPIO chip to request its own descriptor * @gc: GPIO chip * @hwnum: hardware number of the GPIO for which to request the descriptor * @label: label for the GPIO * @lflags: lookup flags for this GPIO or 0 if default, this can be used to * specify things like line inversion semantics with the machine flags * such as GPIO_OUT_LOW * @dflags: descriptor request flags for this GPIO or 0 if default, this * can be used to specify consumer semantics such as open drain * * Function allows GPIO chip drivers to request and use their own GPIO * descriptors via gpiolib API. Difference to gpiod_request() is that this * function will not increase reference count of the GPIO chip module. This * allows the GPIO chip module to be unloaded as needed (we assume that the * GPIO chip driver handles freeing the GPIOs it has requested). * * Returns: * A pointer to the GPIO descriptor, or an ERR_PTR()-encoded negative error * code on failure. */ struct gpio_desc *gpiochip_request_own_desc(struct gpio_chip *gc, unsigned int hwnum, const char *label, enum gpio_lookup_flags lflags, enum gpiod_flags dflags) { struct gpio_desc *desc = gpiochip_get_desc(gc, hwnum); int ret; if (IS_ERR(desc)) { chip_err(gc, "failed to get GPIO descriptor\n"); return desc; } ret = gpiod_request_commit(desc, label); if (ret < 0) return ERR_PTR(ret); ret = gpiod_configure_flags(desc, label, lflags, dflags); if (ret) { chip_err(gc, "setup of own GPIO %s failed\n", label); gpiod_free_commit(desc); return ERR_PTR(ret); } return desc; } EXPORT_SYMBOL_GPL(gpiochip_request_own_desc); /** * gpiochip_free_own_desc - Free GPIO requested by the chip driver * @desc: GPIO descriptor to free * * Function frees the given GPIO requested previously with * gpiochip_request_own_desc(). */ void gpiochip_free_own_desc(struct gpio_desc *desc) { if (desc) gpiod_free_commit(desc); } EXPORT_SYMBOL_GPL(gpiochip_free_own_desc); /* * Drivers MUST set GPIO direction before making get/set calls. 
In * some cases this is done in early boot, before IRQs are enabled. * * As a rule these aren't called more than once (except for drivers * using the open-drain emulation idiom) so these are natural places * to accumulate extra debugging checks. Note that we can't (yet) * rely on gpio_request() having been called beforehand. */ static int gpio_do_set_config(struct gpio_chip *gc, unsigned int offset, unsigned long config) { if (!gc->set_config) return -ENOTSUPP; return gc->set_config(gc, offset, config); } static int gpio_set_config_with_argument(struct gpio_desc *desc, enum pin_config_param mode, u32 argument) { struct gpio_chip *gc = desc->gdev->chip; unsigned long config; config = pinconf_to_config_packed(mode, argument); return gpio_do_set_config(gc, gpio_chip_hwgpio(desc), config); } static int gpio_set_config_with_argument_optional(struct gpio_desc *desc, enum pin_config_param mode, u32 argument) { struct device *dev = &desc->gdev->dev; int gpio = gpio_chip_hwgpio(desc); int ret; ret = gpio_set_config_with_argument(desc, mode, argument); if (ret != -ENOTSUPP) return ret; switch (mode) { case PIN_CONFIG_PERSIST_STATE: dev_dbg(dev, "Persistence not supported for GPIO %d\n", gpio); break; default: break; } return 0; } static int gpio_set_config(struct gpio_desc *desc, enum pin_config_param mode) { return gpio_set_config_with_argument(desc, mode, 0); } static int gpio_set_bias(struct gpio_desc *desc) { enum pin_config_param bias; unsigned int arg; if (test_bit(FLAG_BIAS_DISABLE, &desc->flags)) bias = PIN_CONFIG_BIAS_DISABLE; else if (test_bit(FLAG_PULL_UP, &desc->flags)) bias = PIN_CONFIG_BIAS_PULL_UP; else if (test_bit(FLAG_PULL_DOWN, &desc->flags)) bias = PIN_CONFIG_BIAS_PULL_DOWN; else return 0; switch (bias) { case PIN_CONFIG_BIAS_PULL_DOWN: case PIN_CONFIG_BIAS_PULL_UP: arg = 1; break; default: arg = 0; break; } return gpio_set_config_with_argument_optional(desc, bias, arg); } /** * gpio_set_debounce_timeout() - Set debounce timeout * @desc: GPIO descriptor to set the debounce timeout * @debounce: Debounce timeout in microseconds * * The function calls the certain GPIO driver to set debounce timeout * in the hardware. * * Returns 0 on success, or negative error code otherwise. */ int gpio_set_debounce_timeout(struct gpio_desc *desc, unsigned int debounce) { return gpio_set_config_with_argument_optional(desc, PIN_CONFIG_INPUT_DEBOUNCE, debounce); } /** * gpiod_direction_input - set the GPIO direction to input * @desc: GPIO to set to input * * Set the direction of the passed GPIO to input, such as gpiod_get_value() can * be called safely on it. * * Return 0 in case of success, else an error code. */ int gpiod_direction_input(struct gpio_desc *desc) { struct gpio_chip *gc; int ret = 0; VALIDATE_DESC(desc); gc = desc->gdev->chip; /* * It is legal to have no .get() and .direction_input() specified if * the chip is output-only, but you can't specify .direction_input() * and not support the .get() operation, that doesn't make sense. */ if (!gc->get && gc->direction_input) { gpiod_warn(desc, "%s: missing get() but have direction_input()\n", __func__); return -EIO; } /* * If we have a .direction_input() callback, things are simple, * just call it. Else we are some input-only chip so try to check the * direction (if .get_direction() is supported) else we silently * assume we are in input mode after this. 
*/ if (gc->direction_input) { ret = gc->direction_input(gc, gpio_chip_hwgpio(desc)); } else if (gc->get_direction && (gc->get_direction(gc, gpio_chip_hwgpio(desc)) != 1)) { gpiod_warn(desc, "%s: missing direction_input() operation and line is output\n", __func__); return -EIO; } if (ret == 0) { clear_bit(FLAG_IS_OUT, &desc->flags); ret = gpio_set_bias(desc); } trace_gpio_direction(desc_to_gpio(desc), 1, ret); return ret; } EXPORT_SYMBOL_GPL(gpiod_direction_input); static int gpiod_direction_output_raw_commit(struct gpio_desc *desc, int value) { struct gpio_chip *gc = desc->gdev->chip; int val = !!value; int ret = 0; /* * It's OK not to specify .direction_output() if the gpiochip is * output-only, but if there is then not even a .set() operation it * is pretty tricky to drive the output line. */ if (!gc->set && !gc->direction_output) { gpiod_warn(desc, "%s: missing set() and direction_output() operations\n", __func__); return -EIO; } if (gc->direction_output) { ret = gc->direction_output(gc, gpio_chip_hwgpio(desc), val); } else { /* Check that we are in output mode if we can */ if (gc->get_direction && gc->get_direction(gc, gpio_chip_hwgpio(desc))) { gpiod_warn(desc, "%s: missing direction_output() operation\n", __func__); return -EIO; } /* * If we can't actively set the direction, we are some * output-only chip, so just drive the output as desired. */ gc->set(gc, gpio_chip_hwgpio(desc), val); } if (!ret) set_bit(FLAG_IS_OUT, &desc->flags); trace_gpio_value(desc_to_gpio(desc), 0, val); trace_gpio_direction(desc_to_gpio(desc), 0, ret); return ret; } /** * gpiod_direction_output_raw - set the GPIO direction to output * @desc: GPIO to set to output * @value: initial output value of the GPIO * * Set the direction of the passed GPIO to output, such as gpiod_set_value() can * be called safely on it. The initial value of the output must be specified * as raw value on the physical line without regard for the ACTIVE_LOW status. * * Return 0 in case of success, else an error code. */ int gpiod_direction_output_raw(struct gpio_desc *desc, int value) { VALIDATE_DESC(desc); return gpiod_direction_output_raw_commit(desc, value); } EXPORT_SYMBOL_GPL(gpiod_direction_output_raw); /** * gpiod_direction_output - set the GPIO direction to output * @desc: GPIO to set to output * @value: initial output value of the GPIO * * Set the direction of the passed GPIO to output, such as gpiod_set_value() can * be called safely on it. The initial value of the output must be specified * as the logical value of the GPIO, i.e. taking its ACTIVE_LOW status into * account. * * Return 0 in case of success, else an error code. 
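 *
 * Example sketch for a consumer (the "enable" function name is made up for
 * illustration):
 *
 *	desc = gpiod_get(dev, "enable", GPIOD_ASIS);
 *	if (IS_ERR(desc))
 *		return PTR_ERR(desc);
 *
 *	ret = gpiod_direction_output(desc, 1);
 *	if (ret)
 *		return ret;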
*/ int gpiod_direction_output(struct gpio_desc *desc, int value) { int ret; VALIDATE_DESC(desc); if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) value = !value; else value = !!value; /* GPIOs used for enabled IRQs shall not be set as output */ if (test_bit(FLAG_USED_AS_IRQ, &desc->flags) && test_bit(FLAG_IRQ_IS_ENABLED, &desc->flags)) { gpiod_err(desc, "%s: tried to set a GPIO tied to an IRQ as output\n", __func__); return -EIO; } if (test_bit(FLAG_OPEN_DRAIN, &desc->flags)) { /* First see if we can enable open drain in hardware */ ret = gpio_set_config(desc, PIN_CONFIG_DRIVE_OPEN_DRAIN); if (!ret) goto set_output_value; /* Emulate open drain by not actively driving the line high */ if (value) { ret = gpiod_direction_input(desc); goto set_output_flag; } } else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags)) { ret = gpio_set_config(desc, PIN_CONFIG_DRIVE_OPEN_SOURCE); if (!ret) goto set_output_value; /* Emulate open source by not actively driving the line low */ if (!value) { ret = gpiod_direction_input(desc); goto set_output_flag; } } else { gpio_set_config(desc, PIN_CONFIG_DRIVE_PUSH_PULL); } set_output_value: ret = gpio_set_bias(desc); if (ret) return ret; return gpiod_direction_output_raw_commit(desc, value); set_output_flag: /* * When emulating open-source or open-drain functionalities by not * actively driving the line (setting mode to input) we still need to * set the IS_OUT flag or otherwise we won't be able to set the line * value anymore. */ if (ret == 0) set_bit(FLAG_IS_OUT, &desc->flags); return ret; } EXPORT_SYMBOL_GPL(gpiod_direction_output); /** * gpiod_enable_hw_timestamp_ns - Enable hardware timestamp in nanoseconds. * * @desc: GPIO to enable. * @flags: Flags related to GPIO edge. * * Return 0 in case of success, else negative error code. */ int gpiod_enable_hw_timestamp_ns(struct gpio_desc *desc, unsigned long flags) { int ret = 0; struct gpio_chip *gc; VALIDATE_DESC(desc); gc = desc->gdev->chip; if (!gc->en_hw_timestamp) { gpiod_warn(desc, "%s: hw ts not supported\n", __func__); return -ENOTSUPP; } ret = gc->en_hw_timestamp(gc, gpio_chip_hwgpio(desc), flags); if (ret) gpiod_warn(desc, "%s: hw ts request failed\n", __func__); return ret; } EXPORT_SYMBOL_GPL(gpiod_enable_hw_timestamp_ns); /** * gpiod_disable_hw_timestamp_ns - Disable hardware timestamp. * * @desc: GPIO to disable. * @flags: Flags related to GPIO edge, same value as used during enable call. * * Return 0 in case of success, else negative error code. */ int gpiod_disable_hw_timestamp_ns(struct gpio_desc *desc, unsigned long flags) { int ret = 0; struct gpio_chip *gc; VALIDATE_DESC(desc); gc = desc->gdev->chip; if (!gc->dis_hw_timestamp) { gpiod_warn(desc, "%s: hw ts not supported\n", __func__); return -ENOTSUPP; } ret = gc->dis_hw_timestamp(gc, gpio_chip_hwgpio(desc), flags); if (ret) gpiod_warn(desc, "%s: hw ts release failed\n", __func__); return ret; } EXPORT_SYMBOL_GPL(gpiod_disable_hw_timestamp_ns); /** * gpiod_set_config - sets @config for a GPIO * @desc: descriptor of the GPIO for which to set the configuration * @config: Same packed config format as generic pinconf * * Returns: * 0 on success, %-ENOTSUPP if the controller doesn't support setting the * configuration. 
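 *
 * Example sketch: request a 5 ms debounce period using the generic pinconf
 * packing helper (equivalent to gpiod_set_debounce(desc, 5000) below):
 *
 *	unsigned long config;
 *
 *	config = pinconf_to_config_packed(PIN_CONFIG_INPUT_DEBOUNCE, 5000);
 *	ret = gpiod_set_config(desc, config);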
*/ int gpiod_set_config(struct gpio_desc *desc, unsigned long config) { struct gpio_chip *gc; VALIDATE_DESC(desc); gc = desc->gdev->chip; return gpio_do_set_config(gc, gpio_chip_hwgpio(desc), config); } EXPORT_SYMBOL_GPL(gpiod_set_config); /** * gpiod_set_debounce - sets @debounce time for a GPIO * @desc: descriptor of the GPIO for which to set debounce time * @debounce: debounce time in microseconds * * Returns: * 0 on success, %-ENOTSUPP if the controller doesn't support setting the * debounce time. */ int gpiod_set_debounce(struct gpio_desc *desc, unsigned int debounce) { unsigned long config; config = pinconf_to_config_packed(PIN_CONFIG_INPUT_DEBOUNCE, debounce); return gpiod_set_config(desc, config); } EXPORT_SYMBOL_GPL(gpiod_set_debounce); /** * gpiod_set_transitory - Lose or retain GPIO state on suspend or reset * @desc: descriptor of the GPIO for which to configure persistence * @transitory: True to lose state on suspend or reset, false for persistence * * Returns: * 0 on success, otherwise a negative error code. */ int gpiod_set_transitory(struct gpio_desc *desc, bool transitory) { VALIDATE_DESC(desc); /* * Handle FLAG_TRANSITORY first, enabling queries to gpiolib for * persistence state. */ assign_bit(FLAG_TRANSITORY, &desc->flags, transitory); /* If the driver supports it, set the persistence state now */ return gpio_set_config_with_argument_optional(desc, PIN_CONFIG_PERSIST_STATE, !transitory); } /** * gpiod_is_active_low - test whether a GPIO is active-low or not * @desc: the gpio descriptor to test * * Returns 1 if the GPIO is active-low, 0 otherwise. */ int gpiod_is_active_low(const struct gpio_desc *desc) { VALIDATE_DESC(desc); return test_bit(FLAG_ACTIVE_LOW, &desc->flags); } EXPORT_SYMBOL_GPL(gpiod_is_active_low); /** * gpiod_toggle_active_low - toggle whether a GPIO is active-low or not * @desc: the gpio descriptor to change */ void gpiod_toggle_active_low(struct gpio_desc *desc) { VALIDATE_DESC_VOID(desc); change_bit(FLAG_ACTIVE_LOW, &desc->flags); } EXPORT_SYMBOL_GPL(gpiod_toggle_active_low); static int gpio_chip_get_value(struct gpio_chip *gc, const struct gpio_desc *desc) { return gc->get ? gc->get(gc, gpio_chip_hwgpio(desc)) : -EIO; } /* I/O calls are only valid after configuration completed; the relevant * "is this a valid GPIO" error checks should already have been done. * * "Get" operations are often inlinable as reading a pin value register, * and masking the relevant bit in that register. * * When "set" operations are inlinable, they involve writing that mask to * one register to set a low value, or a different register to set it high. * Otherwise locking is needed, so there may be little value to inlining. * *------------------------------------------------------------------------ * * IMPORTANT!!! The hot paths -- get/set value -- assume that callers * have requested the GPIO. That can include implicit requesting by * a direction setting call. Marking a gpio as requested locks its chip * in memory, guaranteeing that these table lookups need no more locking * and that gpiochip_remove() will fail. * * REVISIT when debugging, consider adding some instrumentation to ensure * that the GPIO was actually requested. */ static int gpiod_get_raw_value_commit(const struct gpio_desc *desc) { struct gpio_chip *gc; int value; gc = desc->gdev->chip; value = gpio_chip_get_value(gc, desc); value = value < 0 ? 
value : !!value; trace_gpio_value(desc_to_gpio(desc), 1, value); return value; } static int gpio_chip_get_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { if (gc->get_multiple) return gc->get_multiple(gc, mask, bits); if (gc->get) { int i, value; for_each_set_bit(i, mask, gc->ngpio) { value = gc->get(gc, i); if (value < 0) return value; __assign_bit(i, bits, value); } return 0; } return -EIO; } int gpiod_get_array_value_complex(bool raw, bool can_sleep, unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { int ret, i = 0; /* * Validate array_info against desc_array and its size. * It should immediately follow desc_array if both * have been obtained from the same gpiod_get_array() call. */ if (array_info && array_info->desc == desc_array && array_size <= array_info->size && (void *)array_info == desc_array + array_info->size) { if (!can_sleep) WARN_ON(array_info->chip->can_sleep); ret = gpio_chip_get_multiple(array_info->chip, array_info->get_mask, value_bitmap); if (ret) return ret; if (!raw && !bitmap_empty(array_info->invert_mask, array_size)) bitmap_xor(value_bitmap, value_bitmap, array_info->invert_mask, array_size); i = find_first_zero_bit(array_info->get_mask, array_size); if (i == array_size) return 0; } else { array_info = NULL; } while (i < array_size) { struct gpio_chip *gc = desc_array[i]->gdev->chip; DECLARE_BITMAP(fastpath_mask, FASTPATH_NGPIO); DECLARE_BITMAP(fastpath_bits, FASTPATH_NGPIO); unsigned long *mask, *bits; int first, j; if (likely(gc->ngpio <= FASTPATH_NGPIO)) { mask = fastpath_mask; bits = fastpath_bits; } else { gfp_t flags = can_sleep ? GFP_KERNEL : GFP_ATOMIC; mask = bitmap_alloc(gc->ngpio, flags); if (!mask) return -ENOMEM; bits = bitmap_alloc(gc->ngpio, flags); if (!bits) { bitmap_free(mask); return -ENOMEM; } } bitmap_zero(mask, gc->ngpio); if (!can_sleep) WARN_ON(gc->can_sleep); /* collect all inputs belonging to the same chip */ first = i; do { const struct gpio_desc *desc = desc_array[i]; int hwgpio = gpio_chip_hwgpio(desc); __set_bit(hwgpio, mask); i++; if (array_info) i = find_next_zero_bit(array_info->get_mask, array_size, i); } while ((i < array_size) && (desc_array[i]->gdev->chip == gc)); ret = gpio_chip_get_multiple(gc, mask, bits); if (ret) { if (mask != fastpath_mask) bitmap_free(mask); if (bits != fastpath_bits) bitmap_free(bits); return ret; } for (j = first; j < i; ) { const struct gpio_desc *desc = desc_array[j]; int hwgpio = gpio_chip_hwgpio(desc); int value = test_bit(hwgpio, bits); if (!raw && test_bit(FLAG_ACTIVE_LOW, &desc->flags)) value = !value; __assign_bit(j, value_bitmap, value); trace_gpio_value(desc_to_gpio(desc), 1, value); j++; if (array_info) j = find_next_zero_bit(array_info->get_mask, i, j); } if (mask != fastpath_mask) bitmap_free(mask); if (bits != fastpath_bits) bitmap_free(bits); } return 0; } /** * gpiod_get_raw_value() - return a gpio's raw value * @desc: gpio whose value will be returned * * Return the GPIO's raw value, i.e. the value of the physical line disregarding * its ACTIVE_LOW status, or negative errno on failure. * * This function can be called from contexts where we cannot sleep, and will * complain if the GPIO chip functions potentially sleep. 
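 *
 * For an active-low line the raw and logical readings differ, e.g. if the
 * physical line is high:
 *
 *	raw = gpiod_get_raw_value(desc);	returns 1
 *	val = gpiod_get_value(desc);		returns 0 (ACTIVE_LOW set)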
*/ int gpiod_get_raw_value(const struct gpio_desc *desc) { VALIDATE_DESC(desc); /* Should be using gpiod_get_raw_value_cansleep() */ WARN_ON(desc->gdev->chip->can_sleep); return gpiod_get_raw_value_commit(desc); } EXPORT_SYMBOL_GPL(gpiod_get_raw_value); /** * gpiod_get_value() - return a gpio's value * @desc: gpio whose value will be returned * * Return the GPIO's logical value, i.e. taking the ACTIVE_LOW status into * account, or negative errno on failure. * * This function can be called from contexts where we cannot sleep, and will * complain if the GPIO chip functions potentially sleep. */ int gpiod_get_value(const struct gpio_desc *desc) { int value; VALIDATE_DESC(desc); /* Should be using gpiod_get_value_cansleep() */ WARN_ON(desc->gdev->chip->can_sleep); value = gpiod_get_raw_value_commit(desc); if (value < 0) return value; if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) value = !value; return value; } EXPORT_SYMBOL_GPL(gpiod_get_value); /** * gpiod_get_raw_array_value() - read raw values from an array of GPIOs * @array_size: number of elements in the descriptor array / value bitmap * @desc_array: array of GPIO descriptors whose values will be read * @array_info: information on applicability of fast bitmap processing path * @value_bitmap: bitmap to store the read values * * Read the raw values of the GPIOs, i.e. the values of the physical lines * without regard for their ACTIVE_LOW status. Return 0 in case of success, * else an error code. * * This function can be called from contexts where we cannot sleep, * and it will complain if the GPIO chip functions potentially sleep. */ int gpiod_get_raw_array_value(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { if (!desc_array) return -EINVAL; return gpiod_get_array_value_complex(true, false, array_size, desc_array, array_info, value_bitmap); } EXPORT_SYMBOL_GPL(gpiod_get_raw_array_value); /** * gpiod_get_array_value() - read values from an array of GPIOs * @array_size: number of elements in the descriptor array / value bitmap * @desc_array: array of GPIO descriptors whose values will be read * @array_info: information on applicability of fast bitmap processing path * @value_bitmap: bitmap to store the read values * * Read the logical values of the GPIOs, i.e. taking their ACTIVE_LOW status * into account. Return 0 in case of success, else an error code. * * This function can be called from contexts where we cannot sleep, * and it will complain if the GPIO chip functions potentially sleep. */ int gpiod_get_array_value(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { if (!desc_array) return -EINVAL; return gpiod_get_array_value_complex(false, false, array_size, desc_array, array_info, value_bitmap); } EXPORT_SYMBOL_GPL(gpiod_get_array_value); /* * gpio_set_open_drain_value_commit() - Set the open drain gpio's value. * @desc: gpio descriptor whose state need to be set. * @value: Non-zero for setting it HIGH otherwise it will set to LOW. 
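 *
 * "HIGH" is emulated here by switching the line to input and letting the
 * external pull-up raise it, while LOW is actively driven; see the
 * direction calls below.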
*/ static void gpio_set_open_drain_value_commit(struct gpio_desc *desc, bool value) { int ret = 0; struct gpio_chip *gc = desc->gdev->chip; int offset = gpio_chip_hwgpio(desc); if (value) { ret = gc->direction_input(gc, offset); } else { ret = gc->direction_output(gc, offset, 0); if (!ret) set_bit(FLAG_IS_OUT, &desc->flags); } trace_gpio_direction(desc_to_gpio(desc), value, ret); if (ret < 0) gpiod_err(desc, "%s: Error in set_value for open drain err %d\n", __func__, ret); } /* * _gpio_set_open_source_value() - Set the open source gpio's value. * @desc: gpio descriptor whose state need to be set. * @value: Non-zero for setting it HIGH otherwise it will set to LOW. */ static void gpio_set_open_source_value_commit(struct gpio_desc *desc, bool value) { int ret = 0; struct gpio_chip *gc = desc->gdev->chip; int offset = gpio_chip_hwgpio(desc); if (value) { ret = gc->direction_output(gc, offset, 1); if (!ret) set_bit(FLAG_IS_OUT, &desc->flags); } else { ret = gc->direction_input(gc, offset); } trace_gpio_direction(desc_to_gpio(desc), !value, ret); if (ret < 0) gpiod_err(desc, "%s: Error in set_value for open source err %d\n", __func__, ret); } static void gpiod_set_raw_value_commit(struct gpio_desc *desc, bool value) { struct gpio_chip *gc; gc = desc->gdev->chip; trace_gpio_value(desc_to_gpio(desc), 0, value); gc->set(gc, gpio_chip_hwgpio(desc), value); } /* * set multiple outputs on the same chip; * use the chip's set_multiple function if available; * otherwise set the outputs sequentially; * @chip: the GPIO chip we operate on * @mask: bit mask array; one bit per output; BITS_PER_LONG bits per word * defines which outputs are to be changed * @bits: bit value array; one bit per output; BITS_PER_LONG bits per word * defines the values the outputs specified by mask are to be set to */ static void gpio_chip_set_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { if (gc->set_multiple) { gc->set_multiple(gc, mask, bits); } else { unsigned int i; /* set outputs if the corresponding mask bit is set */ for_each_set_bit(i, mask, gc->ngpio) gc->set(gc, i, test_bit(i, bits)); } } int gpiod_set_array_value_complex(bool raw, bool can_sleep, unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { int i = 0; /* * Validate array_info against desc_array and its size. * It should immediately follow desc_array if both * have been obtained from the same gpiod_get_array() call. */ if (array_info && array_info->desc == desc_array && array_size <= array_info->size && (void *)array_info == desc_array + array_info->size) { if (!can_sleep) WARN_ON(array_info->chip->can_sleep); if (!raw && !bitmap_empty(array_info->invert_mask, array_size)) bitmap_xor(value_bitmap, value_bitmap, array_info->invert_mask, array_size); gpio_chip_set_multiple(array_info->chip, array_info->set_mask, value_bitmap); i = find_first_zero_bit(array_info->set_mask, array_size); if (i == array_size) return 0; } else { array_info = NULL; } while (i < array_size) { struct gpio_chip *gc = desc_array[i]->gdev->chip; DECLARE_BITMAP(fastpath_mask, FASTPATH_NGPIO); DECLARE_BITMAP(fastpath_bits, FASTPATH_NGPIO); unsigned long *mask, *bits; int count = 0; if (likely(gc->ngpio <= FASTPATH_NGPIO)) { mask = fastpath_mask; bits = fastpath_bits; } else { gfp_t flags = can_sleep ? 
GFP_KERNEL : GFP_ATOMIC; mask = bitmap_alloc(gc->ngpio, flags); if (!mask) return -ENOMEM; bits = bitmap_alloc(gc->ngpio, flags); if (!bits) { bitmap_free(mask); return -ENOMEM; } } bitmap_zero(mask, gc->ngpio); if (!can_sleep) WARN_ON(gc->can_sleep); do { struct gpio_desc *desc = desc_array[i]; int hwgpio = gpio_chip_hwgpio(desc); int value = test_bit(i, value_bitmap); /* * Pins applicable for fast input but not for * fast output processing may have been already * inverted inside the fast path, skip them. */ if (!raw && !(array_info && test_bit(i, array_info->invert_mask)) && test_bit(FLAG_ACTIVE_LOW, &desc->flags)) value = !value; trace_gpio_value(desc_to_gpio(desc), 0, value); /* * collect all normal outputs belonging to the same chip * open drain and open source outputs are set individually */ if (test_bit(FLAG_OPEN_DRAIN, &desc->flags) && !raw) { gpio_set_open_drain_value_commit(desc, value); } else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags) && !raw) { gpio_set_open_source_value_commit(desc, value); } else { __set_bit(hwgpio, mask); __assign_bit(hwgpio, bits, value); count++; } i++; if (array_info) i = find_next_zero_bit(array_info->set_mask, array_size, i); } while ((i < array_size) && (desc_array[i]->gdev->chip == gc)); /* push collected bits to outputs */ if (count != 0) gpio_chip_set_multiple(gc, mask, bits); if (mask != fastpath_mask) bitmap_free(mask); if (bits != fastpath_bits) bitmap_free(bits); } return 0; } /** * gpiod_set_raw_value() - assign a gpio's raw value * @desc: gpio whose value will be assigned * @value: value to assign * * Set the raw value of the GPIO, i.e. the value of its physical line without * regard for its ACTIVE_LOW status. * * This function can be called from contexts where we cannot sleep, and will * complain if the GPIO chip functions potentially sleep. */ void gpiod_set_raw_value(struct gpio_desc *desc, int value) { VALIDATE_DESC_VOID(desc); /* Should be using gpiod_set_raw_value_cansleep() */ WARN_ON(desc->gdev->chip->can_sleep); gpiod_set_raw_value_commit(desc, value); } EXPORT_SYMBOL_GPL(gpiod_set_raw_value); /** * gpiod_set_value_nocheck() - set a GPIO line value without checking * @desc: the descriptor to set the value on * @value: value to set * * This sets the value of a GPIO line backing a descriptor, applying * different semantic quirks like active low and open drain/source * handling. */ static void gpiod_set_value_nocheck(struct gpio_desc *desc, int value) { if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) value = !value; if (test_bit(FLAG_OPEN_DRAIN, &desc->flags)) gpio_set_open_drain_value_commit(desc, value); else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags)) gpio_set_open_source_value_commit(desc, value); else gpiod_set_raw_value_commit(desc, value); } /** * gpiod_set_value() - assign a gpio's value * @desc: gpio whose value will be assigned * @value: value to assign * * Set the logical value of the GPIO, i.e. taking its ACTIVE_LOW, * OPEN_DRAIN and OPEN_SOURCE flags into account. * * This function can be called from contexts where we cannot sleep, and will * complain if the GPIO chip functions potentially sleep. 
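 *
 * A short usage sketch (illustrative; assumes "led" was requested as an
 * output, e.g. with gpiod_get(dev, "led", GPIOD_OUT_LOW)):
 *
 *   gpiod_set_value(led, 1);        // logically assert the line
 *   gpiod_set_value(led, 0);        // logically deassert it
 *
 * With ACTIVE_LOW set, asserting the line drives the physical level low;
 * the open drain/source quirks are applied in the same way.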
*/ void gpiod_set_value(struct gpio_desc *desc, int value) { VALIDATE_DESC_VOID(desc); /* Should be using gpiod_set_value_cansleep() */ WARN_ON(desc->gdev->chip->can_sleep); gpiod_set_value_nocheck(desc, value); } EXPORT_SYMBOL_GPL(gpiod_set_value); /** * gpiod_set_raw_array_value() - assign values to an array of GPIOs * @array_size: number of elements in the descriptor array / value bitmap * @desc_array: array of GPIO descriptors whose values will be assigned * @array_info: information on applicability of fast bitmap processing path * @value_bitmap: bitmap of values to assign * * Set the raw values of the GPIOs, i.e. the values of the physical lines * without regard for their ACTIVE_LOW status. * * This function can be called from contexts where we cannot sleep, and will * complain if the GPIO chip functions potentially sleep. */ int gpiod_set_raw_array_value(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { if (!desc_array) return -EINVAL; return gpiod_set_array_value_complex(true, false, array_size, desc_array, array_info, value_bitmap); } EXPORT_SYMBOL_GPL(gpiod_set_raw_array_value); /** * gpiod_set_array_value() - assign values to an array of GPIOs * @array_size: number of elements in the descriptor array / value bitmap * @desc_array: array of GPIO descriptors whose values will be assigned * @array_info: information on applicability of fast bitmap processing path * @value_bitmap: bitmap of values to assign * * Set the logical values of the GPIOs, i.e. taking their ACTIVE_LOW status * into account. * * This function can be called from contexts where we cannot sleep, and will * complain if the GPIO chip functions potentially sleep. */ int gpiod_set_array_value(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { if (!desc_array) return -EINVAL; return gpiod_set_array_value_complex(false, false, array_size, desc_array, array_info, value_bitmap); } EXPORT_SYMBOL_GPL(gpiod_set_array_value); /** * gpiod_cansleep() - report whether gpio value access may sleep * @desc: gpio to check * */ int gpiod_cansleep(const struct gpio_desc *desc) { VALIDATE_DESC(desc); return desc->gdev->chip->can_sleep; } EXPORT_SYMBOL_GPL(gpiod_cansleep); /** * gpiod_set_consumer_name() - set the consumer name for the descriptor * @desc: gpio to set the consumer name on * @name: the new consumer name */ int gpiod_set_consumer_name(struct gpio_desc *desc, const char *name) { VALIDATE_DESC(desc); if (name) { name = kstrdup_const(name, GFP_KERNEL); if (!name) return -ENOMEM; } kfree_const(desc->label); desc_set_label(desc, name); return 0; } EXPORT_SYMBOL_GPL(gpiod_set_consumer_name); /** * gpiod_to_irq() - return the IRQ corresponding to a GPIO * @desc: gpio whose IRQ will be returned (already requested) * * Return the IRQ corresponding to the passed GPIO, or an error code in case of * error. */ int gpiod_to_irq(const struct gpio_desc *desc) { struct gpio_chip *gc; int offset; /* * Cannot VALIDATE_DESC() here as gpiod_to_irq() consumer semantics * requires this function to not return zero on an invalid descriptor * but rather a negative error number. 
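 *
 * A typical consumer pattern looks like (illustrative):
 *
 *   int irq = gpiod_to_irq(desc);
 *
 *   if (irq < 0)
 *           return irq;
 *
 * so an invalid descriptor has to yield a negative errno, never zero.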
*/ if (!desc || IS_ERR(desc) || !desc->gdev || !desc->gdev->chip) return -EINVAL; gc = desc->gdev->chip; offset = gpio_chip_hwgpio(desc); if (gc->to_irq) { int retirq = gc->to_irq(gc, offset); /* Zero means NO_IRQ */ if (!retirq) return -ENXIO; return retirq; } #ifdef CONFIG_GPIOLIB_IRQCHIP if (gc->irq.chip) { /* * Avoid race condition with other code, which tries to lookup * an IRQ before the irqchip has been properly registered, * i.e. while gpiochip is still being brought up. */ return -EPROBE_DEFER; } #endif return -ENXIO; } EXPORT_SYMBOL_GPL(gpiod_to_irq); /** * gpiochip_lock_as_irq() - lock a GPIO to be used as IRQ * @gc: the chip the GPIO to lock belongs to * @offset: the offset of the GPIO to lock as IRQ * * This is used directly by GPIO drivers that want to lock down * a certain GPIO line to be used for IRQs. */ int gpiochip_lock_as_irq(struct gpio_chip *gc, unsigned int offset) { struct gpio_desc *desc; desc = gpiochip_get_desc(gc, offset); if (IS_ERR(desc)) return PTR_ERR(desc); /* * If it's fast: flush the direction setting if something changed * behind our back */ if (!gc->can_sleep && gc->get_direction) { int dir = gpiod_get_direction(desc); if (dir < 0) { chip_err(gc, "%s: cannot get GPIO direction\n", __func__); return dir; } } /* To be valid for IRQ the line needs to be input or open drain */ if (test_bit(FLAG_IS_OUT, &desc->flags) && !test_bit(FLAG_OPEN_DRAIN, &desc->flags)) { chip_err(gc, "%s: tried to flag a GPIO set as output for IRQ\n", __func__); return -EIO; } set_bit(FLAG_USED_AS_IRQ, &desc->flags); set_bit(FLAG_IRQ_IS_ENABLED, &desc->flags); /* * If the consumer has not set up a label (such as when the * IRQ is referenced from .to_irq()) we set up a label here * so it is clear this is used as an interrupt. */ if (!desc->label) desc_set_label(desc, "interrupt"); return 0; } EXPORT_SYMBOL_GPL(gpiochip_lock_as_irq); /** * gpiochip_unlock_as_irq() - unlock a GPIO used as IRQ * @gc: the chip the GPIO to lock belongs to * @offset: the offset of the GPIO to lock as IRQ * * This is used directly by GPIO drivers that want to indicate * that a certain GPIO is no longer used exclusively for IRQ. */ void gpiochip_unlock_as_irq(struct gpio_chip *gc, unsigned int offset) { struct gpio_desc *desc; desc = gpiochip_get_desc(gc, offset); if (IS_ERR(desc)) return; clear_bit(FLAG_USED_AS_IRQ, &desc->flags); clear_bit(FLAG_IRQ_IS_ENABLED, &desc->flags); /* If we only had this marking, erase it */ if (desc->label && !strcmp(desc->label, "interrupt")) desc_set_label(desc, NULL); } EXPORT_SYMBOL_GPL(gpiochip_unlock_as_irq); void gpiochip_disable_irq(struct gpio_chip *gc, unsigned int offset) { struct gpio_desc *desc = gpiochip_get_desc(gc, offset); if (!IS_ERR(desc) && !WARN_ON(!test_bit(FLAG_USED_AS_IRQ, &desc->flags))) clear_bit(FLAG_IRQ_IS_ENABLED, &desc->flags); } EXPORT_SYMBOL_GPL(gpiochip_disable_irq); void gpiochip_enable_irq(struct gpio_chip *gc, unsigned int offset) { struct gpio_desc *desc = gpiochip_get_desc(gc, offset); if (!IS_ERR(desc) && !WARN_ON(!test_bit(FLAG_USED_AS_IRQ, &desc->flags))) { /* * We must not be output when using IRQ UNLESS we are * open drain. 
*/ WARN_ON(test_bit(FLAG_IS_OUT, &desc->flags) && !test_bit(FLAG_OPEN_DRAIN, &desc->flags)); set_bit(FLAG_IRQ_IS_ENABLED, &desc->flags); } } EXPORT_SYMBOL_GPL(gpiochip_enable_irq); bool gpiochip_line_is_irq(struct gpio_chip *gc, unsigned int offset) { if (offset >= gc->ngpio) return false; return test_bit(FLAG_USED_AS_IRQ, &gc->gpiodev->descs[offset].flags); } EXPORT_SYMBOL_GPL(gpiochip_line_is_irq); int gpiochip_reqres_irq(struct gpio_chip *gc, unsigned int offset) { int ret; if (!try_module_get(gc->gpiodev->owner)) return -ENODEV; ret = gpiochip_lock_as_irq(gc, offset); if (ret) { chip_err(gc, "unable to lock HW IRQ %u for IRQ\n", offset); module_put(gc->gpiodev->owner); return ret; } return 0; } EXPORT_SYMBOL_GPL(gpiochip_reqres_irq); void gpiochip_relres_irq(struct gpio_chip *gc, unsigned int offset) { gpiochip_unlock_as_irq(gc, offset); module_put(gc->gpiodev->owner); } EXPORT_SYMBOL_GPL(gpiochip_relres_irq); bool gpiochip_line_is_open_drain(struct gpio_chip *gc, unsigned int offset) { if (offset >= gc->ngpio) return false; return test_bit(FLAG_OPEN_DRAIN, &gc->gpiodev->descs[offset].flags); } EXPORT_SYMBOL_GPL(gpiochip_line_is_open_drain); bool gpiochip_line_is_open_source(struct gpio_chip *gc, unsigned int offset) { if (offset >= gc->ngpio) return false; return test_bit(FLAG_OPEN_SOURCE, &gc->gpiodev->descs[offset].flags); } EXPORT_SYMBOL_GPL(gpiochip_line_is_open_source); bool gpiochip_line_is_persistent(struct gpio_chip *gc, unsigned int offset) { if (offset >= gc->ngpio) return false; return !test_bit(FLAG_TRANSITORY, &gc->gpiodev->descs[offset].flags); } EXPORT_SYMBOL_GPL(gpiochip_line_is_persistent); /** * gpiod_get_raw_value_cansleep() - return a gpio's raw value * @desc: gpio whose value will be returned * * Return the GPIO's raw value, i.e. the value of the physical line disregarding * its ACTIVE_LOW status, or negative errno on failure. * * This function is to be called from contexts that can sleep. */ int gpiod_get_raw_value_cansleep(const struct gpio_desc *desc) { might_sleep(); VALIDATE_DESC(desc); return gpiod_get_raw_value_commit(desc); } EXPORT_SYMBOL_GPL(gpiod_get_raw_value_cansleep); /** * gpiod_get_value_cansleep() - return a gpio's value * @desc: gpio whose value will be returned * * Return the GPIO's logical value, i.e. taking the ACTIVE_LOW status into * account, or negative errno on failure. * * This function is to be called from contexts that can sleep. */ int gpiod_get_value_cansleep(const struct gpio_desc *desc) { int value; might_sleep(); VALIDATE_DESC(desc); value = gpiod_get_raw_value_commit(desc); if (value < 0) return value; if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) value = !value; return value; } EXPORT_SYMBOL_GPL(gpiod_get_value_cansleep); /** * gpiod_get_raw_array_value_cansleep() - read raw values from an array of GPIOs * @array_size: number of elements in the descriptor array / value bitmap * @desc_array: array of GPIO descriptors whose values will be read * @array_info: information on applicability of fast bitmap processing path * @value_bitmap: bitmap to store the read values * * Read the raw values of the GPIOs, i.e. the values of the physical lines * without regard for their ACTIVE_LOW status. Return 0 in case of success, * else an error code. * * This function is to be called from contexts that can sleep. 
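 *
 * A minimal sketch (illustrative; assumes "descs" came from gpiod_get_array()
 * and holds no more than 8 lines):
 *
 *   DECLARE_BITMAP(values, 8);
 *   int ret;
 *
 *   ret = gpiod_get_raw_array_value_cansleep(descs->ndescs, descs->desc,
 *                                            descs->info, values);
 *   if (ret)
 *           return ret;
 *   // bit i of "values" now holds the physical level of descs->desc[i]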
*/ int gpiod_get_raw_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { might_sleep(); if (!desc_array) return -EINVAL; return gpiod_get_array_value_complex(true, true, array_size, desc_array, array_info, value_bitmap); } EXPORT_SYMBOL_GPL(gpiod_get_raw_array_value_cansleep); /** * gpiod_get_array_value_cansleep() - read values from an array of GPIOs * @array_size: number of elements in the descriptor array / value bitmap * @desc_array: array of GPIO descriptors whose values will be read * @array_info: information on applicability of fast bitmap processing path * @value_bitmap: bitmap to store the read values * * Read the logical values of the GPIOs, i.e. taking their ACTIVE_LOW status * into account. Return 0 in case of success, else an error code. * * This function is to be called from contexts that can sleep. */ int gpiod_get_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { might_sleep(); if (!desc_array) return -EINVAL; return gpiod_get_array_value_complex(false, true, array_size, desc_array, array_info, value_bitmap); } EXPORT_SYMBOL_GPL(gpiod_get_array_value_cansleep); /** * gpiod_set_raw_value_cansleep() - assign a gpio's raw value * @desc: gpio whose value will be assigned * @value: value to assign * * Set the raw value of the GPIO, i.e. the value of its physical line without * regard for its ACTIVE_LOW status. * * This function is to be called from contexts that can sleep. */ void gpiod_set_raw_value_cansleep(struct gpio_desc *desc, int value) { might_sleep(); VALIDATE_DESC_VOID(desc); gpiod_set_raw_value_commit(desc, value); } EXPORT_SYMBOL_GPL(gpiod_set_raw_value_cansleep); /** * gpiod_set_value_cansleep() - assign a gpio's value * @desc: gpio whose value will be assigned * @value: value to assign * * Set the logical value of the GPIO, i.e. taking its ACTIVE_LOW status into * account * * This function is to be called from contexts that can sleep. */ void gpiod_set_value_cansleep(struct gpio_desc *desc, int value) { might_sleep(); VALIDATE_DESC_VOID(desc); gpiod_set_value_nocheck(desc, value); } EXPORT_SYMBOL_GPL(gpiod_set_value_cansleep); /** * gpiod_set_raw_array_value_cansleep() - assign values to an array of GPIOs * @array_size: number of elements in the descriptor array / value bitmap * @desc_array: array of GPIO descriptors whose values will be assigned * @array_info: information on applicability of fast bitmap processing path * @value_bitmap: bitmap of values to assign * * Set the raw values of the GPIOs, i.e. the values of the physical lines * without regard for their ACTIVE_LOW status. * * This function is to be called from contexts that can sleep. 
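 *
 * For example (illustrative; "descs" as returned by gpiod_get_array(),
 * again assuming no more than 8 lines):
 *
 *   DECLARE_BITMAP(values, 8);
 *
 *   bitmap_zero(values, descs->ndescs);
 *   __set_bit(0, values);           // drive the first line high, rest low
 *   gpiod_set_raw_array_value_cansleep(descs->ndescs, descs->desc,
 *                                      descs->info, values);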
*/ int gpiod_set_raw_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { might_sleep(); if (!desc_array) return -EINVAL; return gpiod_set_array_value_complex(true, true, array_size, desc_array, array_info, value_bitmap); } EXPORT_SYMBOL_GPL(gpiod_set_raw_array_value_cansleep); /** * gpiod_add_lookup_tables() - register GPIO device consumers * @tables: list of tables of consumers to register * @n: number of tables in the list */ void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n) { unsigned int i; mutex_lock(&gpio_lookup_lock); for (i = 0; i < n; i++) list_add_tail(&tables[i]->list, &gpio_lookup_list); mutex_unlock(&gpio_lookup_lock); } /** * gpiod_set_array_value_cansleep() - assign values to an array of GPIOs * @array_size: number of elements in the descriptor array / value bitmap * @desc_array: array of GPIO descriptors whose values will be assigned * @array_info: information on applicability of fast bitmap processing path * @value_bitmap: bitmap of values to assign * * Set the logical values of the GPIOs, i.e. taking their ACTIVE_LOW status * into account. * * This function is to be called from contexts that can sleep. */ int gpiod_set_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { might_sleep(); if (!desc_array) return -EINVAL; return gpiod_set_array_value_complex(false, true, array_size, desc_array, array_info, value_bitmap); } EXPORT_SYMBOL_GPL(gpiod_set_array_value_cansleep); void gpiod_line_state_notify(struct gpio_desc *desc, unsigned long action) { blocking_notifier_call_chain(&desc->gdev->line_state_notifier, action, desc); } /** * gpiod_add_lookup_table() - register GPIO device consumers * @table: table of consumers to register */ void gpiod_add_lookup_table(struct gpiod_lookup_table *table) { gpiod_add_lookup_tables(&table, 1); } EXPORT_SYMBOL_GPL(gpiod_add_lookup_table); /** * gpiod_remove_lookup_table() - unregister GPIO device consumers * @table: table of consumers to unregister */ void gpiod_remove_lookup_table(struct gpiod_lookup_table *table) { /* Nothing to remove */ if (!table) return; mutex_lock(&gpio_lookup_lock); list_del(&table->list); mutex_unlock(&gpio_lookup_lock); } EXPORT_SYMBOL_GPL(gpiod_remove_lookup_table); /** * gpiod_add_hogs() - register a set of GPIO hogs from machine code * @hogs: table of gpio hog entries with a zeroed sentinel at the end */ void gpiod_add_hogs(struct gpiod_hog *hogs) { struct gpiod_hog *hog; mutex_lock(&gpio_machine_hogs_mutex); for (hog = &hogs[0]; hog->chip_label; hog++) { list_add_tail(&hog->list, &gpio_machine_hogs); /* * The chip may have been registered earlier, so check if it * exists and, if so, try to hog the line now. */ struct gpio_device *gdev __free(gpio_device_put) = gpio_device_find_by_label(hog->chip_label); if (gdev) gpiochip_machine_hog(gpio_device_get_chip(gdev), hog); } mutex_unlock(&gpio_machine_hogs_mutex); } EXPORT_SYMBOL_GPL(gpiod_add_hogs); void gpiod_remove_hogs(struct gpiod_hog *hogs) { struct gpiod_hog *hog; mutex_lock(&gpio_machine_hogs_mutex); for (hog = &hogs[0]; hog->chip_label; hog++) list_del(&hog->list); mutex_unlock(&gpio_machine_hogs_mutex); } EXPORT_SYMBOL_GPL(gpiod_remove_hogs); static struct gpiod_lookup_table *gpiod_find_lookup_table(struct device *dev) { const char *dev_id = dev ? 
dev_name(dev) : NULL; struct gpiod_lookup_table *table; list_for_each_entry(table, &gpio_lookup_list, list) { if (table->dev_id && dev_id) { /* * Valid strings on both ends, must be identical to have * a match */ if (!strcmp(table->dev_id, dev_id)) return table; } else { /* * One of the pointers is NULL, so both must be to have * a match */ if (dev_id == table->dev_id) return table; } } return NULL; } static struct gpio_desc *gpiod_find(struct device *dev, const char *con_id, unsigned int idx, unsigned long *flags) { struct gpio_desc *desc = ERR_PTR(-ENOENT); struct gpiod_lookup_table *table; struct gpiod_lookup *p; struct gpio_chip *gc; guard(mutex)(&gpio_lookup_lock); table = gpiod_find_lookup_table(dev); if (!table) return desc; for (p = &table->table[0]; p->key; p++) { /* idx must always match exactly */ if (p->idx != idx) continue; /* If the lookup entry has a con_id, require exact match */ if (p->con_id && (!con_id || strcmp(p->con_id, con_id))) continue; if (p->chip_hwnum == U16_MAX) { desc = gpio_name_to_desc(p->key); if (desc) { *flags = p->flags; return desc; } dev_warn(dev, "cannot find GPIO line %s, deferring\n", p->key); return ERR_PTR(-EPROBE_DEFER); } struct gpio_device *gdev __free(gpio_device_put) = gpio_device_find_by_label(p->key); if (!gdev) { /* * As the lookup table indicates a chip with * p->key should exist, assume it may * still appear later and let the interested * consumer be probed again or let the Deferred * Probe infrastructure handle the error. */ dev_warn(dev, "cannot find GPIO chip %s, deferring\n", p->key); return ERR_PTR(-EPROBE_DEFER); } gc = gpio_device_get_chip(gdev); if (gc->ngpio <= p->chip_hwnum) { dev_err(dev, "requested GPIO %u (%u) is out of range [0..%u] for chip %s\n", idx, p->chip_hwnum, gc->ngpio - 1, gc->label); return ERR_PTR(-EINVAL); } desc = gpio_device_get_desc(gdev, p->chip_hwnum); *flags = p->flags; return desc; } return desc; } static int platform_gpio_count(struct device *dev, const char *con_id) { struct gpiod_lookup_table *table; struct gpiod_lookup *p; unsigned int count = 0; scoped_guard(mutex, &gpio_lookup_lock) { table = gpiod_find_lookup_table(dev); if (!table) return -ENOENT; for (p = &table->table[0]; p->key; p++) { if ((con_id && p->con_id && !strcmp(con_id, p->con_id)) || (!con_id && !p->con_id)) count++; } } if (!count) return -ENOENT; return count; } static struct gpio_desc *gpiod_find_by_fwnode(struct fwnode_handle *fwnode, struct device *consumer, const char *con_id, unsigned int idx, enum gpiod_flags *flags, unsigned long *lookupflags) { struct gpio_desc *desc = ERR_PTR(-ENOENT); if (is_of_node(fwnode)) { dev_dbg(consumer, "using DT '%pfw' for '%s' GPIO lookup\n", fwnode, con_id); desc = of_find_gpio(to_of_node(fwnode), con_id, idx, lookupflags); } else if (is_acpi_node(fwnode)) { dev_dbg(consumer, "using ACPI '%pfw' for '%s' GPIO lookup\n", fwnode, con_id); desc = acpi_find_gpio(fwnode, con_id, idx, flags, lookupflags); } else if (is_software_node(fwnode)) { dev_dbg(consumer, "using swnode '%pfw' for '%s' GPIO lookup\n", fwnode, con_id); desc = swnode_find_gpio(fwnode, con_id, idx, lookupflags); } return desc; } static struct gpio_desc *gpiod_find_and_request(struct device *consumer, struct fwnode_handle *fwnode, const char *con_id, unsigned int idx, enum gpiod_flags flags, const char *label, bool platform_lookup_allowed) { unsigned long lookupflags = GPIO_LOOKUP_FLAGS_DEFAULT; struct gpio_desc *desc; int ret; desc = gpiod_find_by_fwnode(fwnode, consumer, con_id, idx, &flags, &lookupflags); if (gpiod_not_found(desc) 
&& platform_lookup_allowed) { /* * Either we are not using DT or ACPI, or their lookup did not * return a result. In that case, use platform lookup as a * fallback. */ dev_dbg(consumer, "using lookup tables for GPIO lookup\n"); desc = gpiod_find(consumer, con_id, idx, &lookupflags); } if (IS_ERR(desc)) { dev_dbg(consumer, "No GPIO consumer %s found\n", con_id); return desc; } /* * If a connection label was passed use that, else attempt to use * the device name as label */ ret = gpiod_request(desc, label); if (ret) { if (!(ret == -EBUSY && flags & GPIOD_FLAGS_BIT_NONEXCLUSIVE)) return ERR_PTR(ret); /* * This happens when there are several consumers for * the same GPIO line: we just return here without * further initialization. It is a bit of a hack. * This is necessary to support fixed regulators. * * FIXME: Make this more sane and safe. */ dev_info(consumer, "nonexclusive access to GPIO for %s\n", con_id); return desc; } ret = gpiod_configure_flags(desc, con_id, lookupflags, flags); if (ret < 0) { dev_dbg(consumer, "setup of GPIO %s failed\n", con_id); gpiod_put(desc); return ERR_PTR(ret); } gpiod_line_state_notify(desc, GPIOLINE_CHANGED_REQUESTED); return desc; } /** * fwnode_gpiod_get_index - obtain a GPIO from firmware node * @fwnode: handle of the firmware node * @con_id: function within the GPIO consumer * @index: index of the GPIO to obtain for the consumer * @flags: GPIO initialization flags * @label: label to attach to the requested GPIO * * This function can be used for drivers that get their configuration * from opaque firmware. * * The function properly finds the corresponding GPIO using whatever is the * underlying firmware interface and then makes sure that the GPIO * descriptor is requested before it is returned to the caller. * * Returns: * On successful request the GPIO pin is configured in accordance with * provided @flags. * * In case of error an ERR_PTR() is returned. */ struct gpio_desc *fwnode_gpiod_get_index(struct fwnode_handle *fwnode, const char *con_id, int index, enum gpiod_flags flags, const char *label) { return gpiod_find_and_request(NULL, fwnode, con_id, index, flags, label, false); } EXPORT_SYMBOL_GPL(fwnode_gpiod_get_index); /** * gpiod_count - return the number of GPIOs associated with a device / function * or -ENOENT if no GPIO has been assigned to the requested function * @dev: GPIO consumer, can be NULL for system-global GPIOs * @con_id: function within the GPIO consumer */ int gpiod_count(struct device *dev, const char *con_id) { const struct fwnode_handle *fwnode = dev ? dev_fwnode(dev) : NULL; int count = -ENOENT; if (is_of_node(fwnode)) count = of_gpio_get_count(dev, con_id); else if (is_acpi_node(fwnode)) count = acpi_gpio_count(dev, con_id); else if (is_software_node(fwnode)) count = swnode_gpio_count(fwnode, con_id); if (count < 0) count = platform_gpio_count(dev, con_id); return count; } EXPORT_SYMBOL_GPL(gpiod_count); /** * gpiod_get - obtain a GPIO for a given GPIO function * @dev: GPIO consumer, can be NULL for system-global GPIOs * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * Return the GPIO descriptor corresponding to the function con_id of device * dev, -ENOENT if no GPIO has been assigned to the requested function, or * another IS_ERR() code if an error occurred while trying to acquire the GPIO. 
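 *
 * Typical probe-time usage (illustrative; "reset" is just an example con_id
 * and "pdev" a hypothetical platform device):
 *
 *   struct gpio_desc *reset;
 *
 *   reset = gpiod_get(&pdev->dev, "reset", GPIOD_OUT_HIGH);
 *   if (IS_ERR(reset))
 *           return PTR_ERR(reset);
 *
 * and the descriptor is released again with gpiod_put() when it is no
 * longer needed.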
*/ struct gpio_desc *__must_check gpiod_get(struct device *dev, const char *con_id, enum gpiod_flags flags) { return gpiod_get_index(dev, con_id, 0, flags); } EXPORT_SYMBOL_GPL(gpiod_get); /** * gpiod_get_optional - obtain an optional GPIO for a given GPIO function * @dev: GPIO consumer, can be NULL for system-global GPIOs * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * This is equivalent to gpiod_get(), except that when no GPIO was assigned to * the requested function it will return NULL. This is convenient for drivers * that need to handle optional GPIOs. */ struct gpio_desc *__must_check gpiod_get_optional(struct device *dev, const char *con_id, enum gpiod_flags flags) { return gpiod_get_index_optional(dev, con_id, 0, flags); } EXPORT_SYMBOL_GPL(gpiod_get_optional); /** * gpiod_configure_flags - helper function to configure a given GPIO * @desc: gpio whose value will be assigned * @con_id: function within the GPIO consumer * @lflags: bitmask of gpio_lookup_flags GPIO_* values - returned from * of_find_gpio() or of_get_gpio_hog() * @dflags: gpiod_flags - optional GPIO initialization flags * * Return 0 on success, -ENOENT if no GPIO has been assigned to the * requested function and/or index, or another IS_ERR() code if an error * occurred while trying to acquire the GPIO. */ int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id, unsigned long lflags, enum gpiod_flags dflags) { int ret; if (lflags & GPIO_ACTIVE_LOW) set_bit(FLAG_ACTIVE_LOW, &desc->flags); if (lflags & GPIO_OPEN_DRAIN) set_bit(FLAG_OPEN_DRAIN, &desc->flags); else if (dflags & GPIOD_FLAGS_BIT_OPEN_DRAIN) { /* * This enforces open drain mode from the consumer side. * This is necessary for some busses like I2C, but the lookup * should *REALLY* have specified them as open drain in the * first place, so print a little warning here. */ set_bit(FLAG_OPEN_DRAIN, &desc->flags); gpiod_warn(desc, "enforced open drain please flag it properly in DT/ACPI DSDT/board file\n"); } if (lflags & GPIO_OPEN_SOURCE) set_bit(FLAG_OPEN_SOURCE, &desc->flags); if (((lflags & GPIO_PULL_UP) && (lflags & GPIO_PULL_DOWN)) || ((lflags & GPIO_PULL_UP) && (lflags & GPIO_PULL_DISABLE)) || ((lflags & GPIO_PULL_DOWN) && (lflags & GPIO_PULL_DISABLE))) { gpiod_err(desc, "multiple pull-up, pull-down or pull-disable enabled, invalid configuration\n"); return -EINVAL; } if (lflags & GPIO_PULL_UP) set_bit(FLAG_PULL_UP, &desc->flags); else if (lflags & GPIO_PULL_DOWN) set_bit(FLAG_PULL_DOWN, &desc->flags); else if (lflags & GPIO_PULL_DISABLE) set_bit(FLAG_BIAS_DISABLE, &desc->flags); ret = gpiod_set_transitory(desc, (lflags & GPIO_TRANSITORY)); if (ret < 0) return ret; /* No particular flag request, return here... */ if (!(dflags & GPIOD_FLAGS_BIT_DIR_SET)) { gpiod_dbg(desc, "no flags found for %s\n", con_id); return 0; } /* Process flags */ if (dflags & GPIOD_FLAGS_BIT_DIR_OUT) ret = gpiod_direction_output(desc, !!(dflags & GPIOD_FLAGS_BIT_DIR_VAL)); else ret = gpiod_direction_input(desc); return ret; } /** * gpiod_get_index - obtain a GPIO from a multi-index GPIO function * @dev: GPIO consumer, can be NULL for system-global GPIOs * @con_id: function within the GPIO consumer * @idx: index of the GPIO to obtain in the consumer * @flags: optional GPIO initialization flags * * This variant of gpiod_get() allows to access GPIOs other than the first * defined one for functions that define several GPIOs. 
* * Return a valid GPIO descriptor, -ENOENT if no GPIO has been assigned to the * requested function and/or index, or another IS_ERR() code if an error * occurred while trying to acquire the GPIO. */ struct gpio_desc *__must_check gpiod_get_index(struct device *dev, const char *con_id, unsigned int idx, enum gpiod_flags flags) { struct fwnode_handle *fwnode = dev ? dev_fwnode(dev) : NULL; const char *devname = dev ? dev_name(dev) : "?"; const char *label = con_id ?: devname; return gpiod_find_and_request(dev, fwnode, con_id, idx, flags, label, true); } EXPORT_SYMBOL_GPL(gpiod_get_index); /** * gpiod_get_index_optional - obtain an optional GPIO from a multi-index GPIO * function * @dev: GPIO consumer, can be NULL for system-global GPIOs * @con_id: function within the GPIO consumer * @index: index of the GPIO to obtain in the consumer * @flags: optional GPIO initialization flags * * This is equivalent to gpiod_get_index(), except that when no GPIO with the * specified index was assigned to the requested function it will return NULL. * This is convenient for drivers that need to handle optional GPIOs. */ struct gpio_desc *__must_check gpiod_get_index_optional(struct device *dev, const char *con_id, unsigned int index, enum gpiod_flags flags) { struct gpio_desc *desc; desc = gpiod_get_index(dev, con_id, index, flags); if (gpiod_not_found(desc)) return NULL; return desc; } EXPORT_SYMBOL_GPL(gpiod_get_index_optional); /** * gpiod_hog - Hog the specified GPIO desc given the provided flags * @desc: gpio whose value will be assigned * @name: gpio line name * @lflags: bitmask of gpio_lookup_flags GPIO_* values - returned from * of_find_gpio() or of_get_gpio_hog() * @dflags: gpiod_flags - optional GPIO initialization flags */ int gpiod_hog(struct gpio_desc *desc, const char *name, unsigned long lflags, enum gpiod_flags dflags) { struct gpio_chip *gc; struct gpio_desc *local_desc; int hwnum; int ret; gc = gpiod_to_chip(desc); hwnum = gpio_chip_hwgpio(desc); local_desc = gpiochip_request_own_desc(gc, hwnum, name, lflags, dflags); if (IS_ERR(local_desc)) { ret = PTR_ERR(local_desc); pr_err("requesting hog GPIO %s (chip %s, offset %d) failed, %d\n", name, gc->label, hwnum, ret); return ret; } /* Mark GPIO as hogged so it can be identified and removed later */ set_bit(FLAG_IS_HOGGED, &desc->flags); gpiod_dbg(desc, "hogged as %s%s\n", (dflags & GPIOD_FLAGS_BIT_DIR_OUT) ? "output" : "input", (dflags & GPIOD_FLAGS_BIT_DIR_OUT) ? (dflags & GPIOD_FLAGS_BIT_DIR_VAL) ? "/high" : "/low" : ""); return 0; } /** * gpiochip_free_hogs - Scan gpio-controller chip and release GPIO hog * @gc: gpio chip to act on */ static void gpiochip_free_hogs(struct gpio_chip *gc) { struct gpio_desc *desc; for_each_gpio_desc_with_flag(gc, desc, FLAG_IS_HOGGED) gpiochip_free_own_desc(desc); } /** * gpiod_get_array - obtain multiple GPIOs from a multi-index GPIO function * @dev: GPIO consumer, can be NULL for system-global GPIOs * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * This function acquires all the GPIOs defined under a given function. * * Return a struct gpio_descs containing an array of descriptors, -ENOENT if * no GPIO has been assigned to the requested function, or another IS_ERR() * code if an error occurred while trying to acquire the GPIOs. 
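 *
 * For instance (illustrative; "led" is a made-up con_id and "dev" the
 * consumer device):
 *
 *   struct gpio_descs *leds;
 *   unsigned int i;
 *
 *   leds = gpiod_get_array(dev, "led", GPIOD_OUT_LOW);
 *   if (IS_ERR(leds))
 *           return PTR_ERR(leds);
 *   for (i = 0; i < leds->ndescs; i++)
 *           gpiod_set_value_cansleep(leds->desc[i], 1);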
*/ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, const char *con_id, enum gpiod_flags flags) { struct gpio_desc *desc; struct gpio_descs *descs; struct gpio_array *array_info = NULL; struct gpio_chip *gc; int count, bitmap_size; size_t descs_size; count = gpiod_count(dev, con_id); if (count < 0) return ERR_PTR(count); descs_size = struct_size(descs, desc, count); descs = kzalloc(descs_size, GFP_KERNEL); if (!descs) return ERR_PTR(-ENOMEM); for (descs->ndescs = 0; descs->ndescs < count; descs->ndescs++) { desc = gpiod_get_index(dev, con_id, descs->ndescs, flags); if (IS_ERR(desc)) { gpiod_put_array(descs); return ERR_CAST(desc); } descs->desc[descs->ndescs] = desc; gc = gpiod_to_chip(desc); /* * If pin hardware number of array member 0 is also 0, select * its chip as a candidate for fast bitmap processing path. */ if (descs->ndescs == 0 && gpio_chip_hwgpio(desc) == 0) { struct gpio_descs *array; bitmap_size = BITS_TO_LONGS(gc->ngpio > count ? gc->ngpio : count); array = krealloc(descs, descs_size + struct_size(array_info, invert_mask, 3 * bitmap_size), GFP_KERNEL | __GFP_ZERO); if (!array) { gpiod_put_array(descs); return ERR_PTR(-ENOMEM); } descs = array; array_info = (void *)descs + descs_size; array_info->get_mask = array_info->invert_mask + bitmap_size; array_info->set_mask = array_info->get_mask + bitmap_size; array_info->desc = descs->desc; array_info->size = count; array_info->chip = gc; bitmap_set(array_info->get_mask, descs->ndescs, count - descs->ndescs); bitmap_set(array_info->set_mask, descs->ndescs, count - descs->ndescs); descs->info = array_info; } /* If there is no cache for fast bitmap processing path, continue */ if (!array_info) continue; /* Unmark array members which don't belong to the 'fast' chip */ if (array_info->chip != gc) { __clear_bit(descs->ndescs, array_info->get_mask); __clear_bit(descs->ndescs, array_info->set_mask); } /* * Detect array members which belong to the 'fast' chip * but their pins are not in hardware order. */ else if (gpio_chip_hwgpio(desc) != descs->ndescs) { /* * Don't use fast path if all array members processed so * far belong to the same chip as this one but its pin * hardware number is different from its array index. */ if (bitmap_full(array_info->get_mask, descs->ndescs)) { array_info = NULL; } else { __clear_bit(descs->ndescs, array_info->get_mask); __clear_bit(descs->ndescs, array_info->set_mask); } } else { /* Exclude open drain or open source from fast output */ if (gpiochip_line_is_open_drain(gc, descs->ndescs) || gpiochip_line_is_open_source(gc, descs->ndescs)) __clear_bit(descs->ndescs, array_info->set_mask); /* Identify 'fast' pins which require invertion */ if (gpiod_is_active_low(desc)) __set_bit(descs->ndescs, array_info->invert_mask); } } if (array_info) dev_dbg(dev, "GPIO array info: chip=%s, size=%d, get_mask=%lx, set_mask=%lx, invert_mask=%lx\n", array_info->chip->label, array_info->size, *array_info->get_mask, *array_info->set_mask, *array_info->invert_mask); return descs; } EXPORT_SYMBOL_GPL(gpiod_get_array); /** * gpiod_get_array_optional - obtain multiple GPIOs from a multi-index GPIO * function * @dev: GPIO consumer, can be NULL for system-global GPIOs * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * This is equivalent to gpiod_get_array(), except that when no GPIO was * assigned to the requested function it will return NULL. 
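 *
 * Callers of the _optional variant only need to distinguish NULL from real
 * errors (illustrative; "enable" is a made-up con_id):
 *
 *   struct gpio_descs *enables;
 *
 *   enables = gpiod_get_array_optional(dev, "enable", GPIOD_OUT_LOW);
 *   if (IS_ERR(enables))
 *           return PTR_ERR(enables);
 *   if (!enables)
 *           dev_dbg(dev, "no enable GPIOs, carrying on without them\n");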
*/ struct gpio_descs *__must_check gpiod_get_array_optional(struct device *dev, const char *con_id, enum gpiod_flags flags) { struct gpio_descs *descs; descs = gpiod_get_array(dev, con_id, flags); if (gpiod_not_found(descs)) return NULL; return descs; } EXPORT_SYMBOL_GPL(gpiod_get_array_optional); /** * gpiod_put - dispose of a GPIO descriptor * @desc: GPIO descriptor to dispose of * * No descriptor can be used after gpiod_put() has been called on it. */ void gpiod_put(struct gpio_desc *desc) { if (desc) gpiod_free(desc); } EXPORT_SYMBOL_GPL(gpiod_put); /** * gpiod_put_array - dispose of multiple GPIO descriptors * @descs: struct gpio_descs containing an array of descriptors */ void gpiod_put_array(struct gpio_descs *descs) { unsigned int i; for (i = 0; i < descs->ndescs; i++) gpiod_put(descs->desc[i]); kfree(descs); } EXPORT_SYMBOL_GPL(gpiod_put_array); static int gpio_stub_drv_probe(struct device *dev) { /* * The DT node of some GPIO chips have a "compatible" property, but * never have a struct device added and probed by a driver to register * the GPIO chip with gpiolib. In such cases, fw_devlink=on will cause * the consumers of the GPIO chip to get probe deferred forever because * they will be waiting for a device associated with the GPIO chip * firmware node to get added and bound to a driver. * * To allow these consumers to probe, we associate the struct * gpio_device of the GPIO chip with the firmware node and then simply * bind it to this stub driver. */ return 0; } static struct device_driver gpio_stub_drv = { .name = "gpio_stub_drv", .bus = &gpio_bus_type, .probe = gpio_stub_drv_probe, }; static int __init gpiolib_dev_init(void) { int ret; /* Register GPIO sysfs bus */ ret = bus_register(&gpio_bus_type); if (ret < 0) { pr_err("gpiolib: could not register GPIO bus type\n"); return ret; } ret = driver_register(&gpio_stub_drv); if (ret < 0) { pr_err("gpiolib: could not register GPIO stub driver\n"); bus_unregister(&gpio_bus_type); return ret; } ret = alloc_chrdev_region(&gpio_devt, 0, GPIO_DEV_MAX, GPIOCHIP_NAME); if (ret < 0) { pr_err("gpiolib: failed to allocate char dev region\n"); driver_unregister(&gpio_stub_drv); bus_unregister(&gpio_bus_type); return ret; } gpiolib_initialized = true; gpiochip_setup_devs(); #if IS_ENABLED(CONFIG_OF_DYNAMIC) && IS_ENABLED(CONFIG_OF_GPIO) WARN_ON(of_reconfig_notifier_register(&gpio_of_notifier)); #endif /* CONFIG_OF_DYNAMIC && CONFIG_OF_GPIO */ return ret; } core_initcall(gpiolib_dev_init); #ifdef CONFIG_DEBUG_FS static void gpiolib_dbg_show(struct seq_file *s, struct gpio_device *gdev) { struct gpio_chip *gc = gdev->chip; bool active_low, is_irq, is_out; unsigned int gpio = gdev->base; struct gpio_desc *desc; int value; for_each_gpio_desc(gc, desc) { if (test_bit(FLAG_REQUESTED, &desc->flags)) { gpiod_get_direction(desc); is_out = test_bit(FLAG_IS_OUT, &desc->flags); value = gpio_chip_get_value(gc, desc); is_irq = test_bit(FLAG_USED_AS_IRQ, &desc->flags); active_low = test_bit(FLAG_ACTIVE_LOW, &desc->flags); seq_printf(s, " gpio-%-3d (%-20.20s|%-20.20s) %s %s %s%s\n", gpio, desc->name ?: "", desc->label, is_out ? "out" : "in ", value >= 0 ? (value ? "hi" : "lo") : "? ", is_irq ? "IRQ " : "", active_low ? 
"ACTIVE LOW" : ""); } else if (desc->name) { seq_printf(s, " gpio-%-3d (%-20.20s)\n", gpio, desc->name); } gpio++; } } static void *gpiolib_seq_start(struct seq_file *s, loff_t *pos) { unsigned long flags; struct gpio_device *gdev = NULL; loff_t index = *pos; s->private = ""; spin_lock_irqsave(&gpio_lock, flags); list_for_each_entry(gdev, &gpio_devices, list) if (index-- == 0) { spin_unlock_irqrestore(&gpio_lock, flags); return gdev; } spin_unlock_irqrestore(&gpio_lock, flags); return NULL; } static void *gpiolib_seq_next(struct seq_file *s, void *v, loff_t *pos) { unsigned long flags; struct gpio_device *gdev = v; void *ret = NULL; spin_lock_irqsave(&gpio_lock, flags); if (list_is_last(&gdev->list, &gpio_devices)) ret = NULL; else ret = list_first_entry(&gdev->list, struct gpio_device, list); spin_unlock_irqrestore(&gpio_lock, flags); s->private = "\n"; ++*pos; return ret; } static void gpiolib_seq_stop(struct seq_file *s, void *v) { } static int gpiolib_seq_show(struct seq_file *s, void *v) { struct gpio_device *gdev = v; struct gpio_chip *gc = gdev->chip; struct device *parent; if (!gc) { seq_printf(s, "%s%s: (dangling chip)", (char *)s->private, dev_name(&gdev->dev)); return 0; } seq_printf(s, "%s%s: GPIOs %d-%d", (char *)s->private, dev_name(&gdev->dev), gdev->base, gdev->base + gdev->ngpio - 1); parent = gc->parent; if (parent) seq_printf(s, ", parent: %s/%s", parent->bus ? parent->bus->name : "no-bus", dev_name(parent)); if (gc->label) seq_printf(s, ", %s", gc->label); if (gc->can_sleep) seq_printf(s, ", can sleep"); seq_printf(s, ":\n"); if (gc->dbg_show) gc->dbg_show(s, gc); else gpiolib_dbg_show(s, gdev); return 0; } static const struct seq_operations gpiolib_sops = { .start = gpiolib_seq_start, .next = gpiolib_seq_next, .stop = gpiolib_seq_stop, .show = gpiolib_seq_show, }; DEFINE_SEQ_ATTRIBUTE(gpiolib); static int __init gpiolib_debugfs_init(void) { /* /sys/kernel/debug/gpio */ debugfs_create_file("gpio", 0444, NULL, NULL, &gpiolib_fops); return 0; } subsys_initcall(gpiolib_debugfs_init); #endif /* DEBUG_FS */
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2013 Politecnico di Torino, Italy
 * TORSEC group -- https://security.polito.it
 *
 * Author: Roberto Sassu <roberto.sassu@polito.it>
 *
 * File: ima_template_lib.c
 *       Library of supported template fields.
*/ #include "ima_template_lib.h" #include <linux/xattr.h> #include <linux/evm.h> static bool ima_template_hash_algo_allowed(u8 algo) { if (algo == HASH_ALGO_SHA1 || algo == HASH_ALGO_MD5) return true; return false; } enum data_formats { DATA_FMT_DIGEST = 0, DATA_FMT_DIGEST_WITH_ALGO, DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO, DATA_FMT_STRING, DATA_FMT_HEX, DATA_FMT_UINT }; enum digest_type { DIGEST_TYPE_IMA, DIGEST_TYPE_VERITY, DIGEST_TYPE__LAST }; #define DIGEST_TYPE_NAME_LEN_MAX 7 /* including NUL */ static const char * const digest_type_name[DIGEST_TYPE__LAST] = { [DIGEST_TYPE_IMA] = "ima", [DIGEST_TYPE_VERITY] = "verity" }; static int ima_write_template_field_data(const void *data, const u32 datalen, enum data_formats datafmt, struct ima_field_data *field_data) { u8 *buf, *buf_ptr; u32 buflen = datalen; if (datafmt == DATA_FMT_STRING) buflen = datalen + 1; buf = kzalloc(buflen, GFP_KERNEL); if (!buf) return -ENOMEM; memcpy(buf, data, datalen); /* * Replace all space characters with underscore for event names and * strings. This avoid that, during the parsing of a measurements list, * filenames with spaces or that end with the suffix ' (deleted)' are * split into multiple template fields (the space is the delimitator * character for measurements lists in ASCII format). */ if (datafmt == DATA_FMT_STRING) { for (buf_ptr = buf; buf_ptr - buf < datalen; buf_ptr++) if (*buf_ptr == ' ') *buf_ptr = '_'; } field_data->data = buf; field_data->len = buflen; return 0; } static void ima_show_template_data_ascii(struct seq_file *m, enum ima_show_type show, enum data_formats datafmt, struct ima_field_data *field_data) { u8 *buf_ptr = field_data->data; u32 buflen = field_data->len; switch (datafmt) { case DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO: case DATA_FMT_DIGEST_WITH_ALGO: buf_ptr = strrchr(field_data->data, ':'); if (buf_ptr != field_data->data) seq_printf(m, "%s", field_data->data); /* skip ':' and '\0' */ buf_ptr += 2; buflen -= buf_ptr - field_data->data; fallthrough; case DATA_FMT_DIGEST: case DATA_FMT_HEX: if (!buflen) break; ima_print_digest(m, buf_ptr, buflen); break; case DATA_FMT_STRING: seq_printf(m, "%s", buf_ptr); break; case DATA_FMT_UINT: switch (field_data->len) { case sizeof(u8): seq_printf(m, "%u", *(u8 *)buf_ptr); break; case sizeof(u16): if (ima_canonical_fmt) seq_printf(m, "%u", le16_to_cpu(*(__le16 *)buf_ptr)); else seq_printf(m, "%u", *(u16 *)buf_ptr); break; case sizeof(u32): if (ima_canonical_fmt) seq_printf(m, "%u", le32_to_cpu(*(__le32 *)buf_ptr)); else seq_printf(m, "%u", *(u32 *)buf_ptr); break; case sizeof(u64): if (ima_canonical_fmt) seq_printf(m, "%llu", le64_to_cpu(*(__le64 *)buf_ptr)); else seq_printf(m, "%llu", *(u64 *)buf_ptr); break; default: break; } break; default: break; } } static void ima_show_template_data_binary(struct seq_file *m, enum ima_show_type show, enum data_formats datafmt, struct ima_field_data *field_data) { u32 len = (show == IMA_SHOW_BINARY_OLD_STRING_FMT) ? strlen(field_data->data) : field_data->len; if (show != IMA_SHOW_BINARY_NO_FIELD_LEN) { u32 field_len = !ima_canonical_fmt ? 
len : (__force u32)cpu_to_le32(len); ima_putc(m, &field_len, sizeof(field_len)); } if (!len) return; ima_putc(m, field_data->data, len); } static void ima_show_template_field_data(struct seq_file *m, enum ima_show_type show, enum data_formats datafmt, struct ima_field_data *field_data) { switch (show) { case IMA_SHOW_ASCII: ima_show_template_data_ascii(m, show, datafmt, field_data); break; case IMA_SHOW_BINARY: case IMA_SHOW_BINARY_NO_FIELD_LEN: case IMA_SHOW_BINARY_OLD_STRING_FMT: ima_show_template_data_binary(m, show, datafmt, field_data); break; default: break; } } void ima_show_template_digest(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_DIGEST, field_data); } void ima_show_template_digest_ng(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_DIGEST_WITH_ALGO, field_data); } void ima_show_template_digest_ngv2(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO, field_data); } void ima_show_template_string(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_STRING, field_data); } void ima_show_template_sig(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_HEX, field_data); } void ima_show_template_buf(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_HEX, field_data); } void ima_show_template_uint(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_UINT, field_data); } /** * ima_parse_buf() - Parses lengths and data from an input buffer * @bufstartp: Buffer start address. * @bufendp: Buffer end address. * @bufcurp: Pointer to remaining (non-parsed) data. * @maxfields: Length of fields array. * @fields: Array containing lengths and pointers of parsed data. * @curfields: Number of array items containing parsed data. * @len_mask: Bitmap (if bit is set, data length should not be parsed). * @enforce_mask: Check if curfields == maxfields and/or bufcurp == bufendp. * @bufname: String identifier of the input buffer. * * Return: 0 on success, -EINVAL on error. 
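 *
 * A minimal caller sketch (illustrative; "buf"/"len" describe a buffer laid
 * out as a sequence of <u32 length><data> items):
 *
 *   struct ima_field_data fields[2] = {};
 *   int nr_fields;
 *
 *   if (ima_parse_buf(buf, buf + len, NULL, 2, fields, &nr_fields, NULL,
 *                     ENFORCE_FIELDS | ENFORCE_BUFEND, "example") < 0)
 *           return -EINVAL;
 *   // fields[i].data / fields[i].len now reference slices of "buf"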
*/ int ima_parse_buf(void *bufstartp, void *bufendp, void **bufcurp, int maxfields, struct ima_field_data *fields, int *curfields, unsigned long *len_mask, int enforce_mask, char *bufname) { void *bufp = bufstartp; int i; for (i = 0; i < maxfields; i++) { if (len_mask == NULL || !test_bit(i, len_mask)) { if (bufp > (bufendp - sizeof(u32))) break; if (ima_canonical_fmt) fields[i].len = le32_to_cpu(*(__le32 *)bufp); else fields[i].len = *(u32 *)bufp; bufp += sizeof(u32); } if (bufp > (bufendp - fields[i].len)) break; fields[i].data = bufp; bufp += fields[i].len; } if ((enforce_mask & ENFORCE_FIELDS) && i != maxfields) { pr_err("%s: nr of fields mismatch: expected: %d, current: %d\n", bufname, maxfields, i); return -EINVAL; } if ((enforce_mask & ENFORCE_BUFEND) && bufp != bufendp) { pr_err("%s: buf end mismatch: expected: %p, current: %p\n", bufname, bufendp, bufp); return -EINVAL; } if (curfields) *curfields = i; if (bufcurp) *bufcurp = bufp; return 0; } static int ima_eventdigest_init_common(const u8 *digest, u32 digestsize, u8 digest_type, u8 hash_algo, struct ima_field_data *field_data) { /* * digest formats: * - DATA_FMT_DIGEST: digest * - DATA_FMT_DIGEST_WITH_ALGO: <hash algo> + ':' + '\0' + digest, * - DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO: * <digest type> + ':' + <hash algo> + ':' + '\0' + digest, * * where 'DATA_FMT_DIGEST' is the original digest format ('d') * with a hash size limitation of 20 bytes, * where <digest type> is either "ima" or "verity", * where <hash algo> is the hash_algo_name[] string. */ u8 buffer[DIGEST_TYPE_NAME_LEN_MAX + CRYPTO_MAX_ALG_NAME + 2 + IMA_MAX_DIGEST_SIZE] = { 0 }; enum data_formats fmt = DATA_FMT_DIGEST; u32 offset = 0; if (digest_type < DIGEST_TYPE__LAST && hash_algo < HASH_ALGO__LAST) { fmt = DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO; offset += 1 + sprintf(buffer, "%s:%s:", digest_type_name[digest_type], hash_algo_name[hash_algo]); } else if (hash_algo < HASH_ALGO__LAST) { fmt = DATA_FMT_DIGEST_WITH_ALGO; offset += 1 + sprintf(buffer, "%s:", hash_algo_name[hash_algo]); } if (digest) memcpy(buffer + offset, digest, digestsize); else /* * If digest is NULL, the event being recorded is a violation. * Make room for the digest by increasing the offset by the * hash algorithm digest size. */ offset += hash_digest_size[hash_algo]; return ima_write_template_field_data(buffer, offset + digestsize, fmt, field_data); } /* * This function writes the digest of an event (with size limit). */ int ima_eventdigest_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct ima_max_digest_data hash; u8 *cur_digest = NULL; u32 cur_digestsize = 0; struct inode *inode; int result; memset(&hash, 0, sizeof(hash)); if (event_data->violation) /* recording a violation. 
*/ goto out; if (ima_template_hash_algo_allowed(event_data->iint->ima_hash->algo)) { cur_digest = event_data->iint->ima_hash->digest; cur_digestsize = event_data->iint->ima_hash->length; goto out; } if ((const char *)event_data->filename == boot_aggregate_name) { if (ima_tpm_chip) { hash.hdr.algo = HASH_ALGO_SHA1; result = ima_calc_boot_aggregate(&hash.hdr); /* algo can change depending on available PCR banks */ if (!result && hash.hdr.algo != HASH_ALGO_SHA1) result = -EINVAL; if (result < 0) memset(&hash, 0, sizeof(hash)); } cur_digest = hash.hdr.digest; cur_digestsize = hash_digest_size[HASH_ALGO_SHA1]; goto out; } if (!event_data->file) /* missing info to re-calculate the digest */ return -EINVAL; inode = file_inode(event_data->file); hash.hdr.algo = ima_template_hash_algo_allowed(ima_hash_algo) ? ima_hash_algo : HASH_ALGO_SHA1; result = ima_calc_file_hash(event_data->file, &hash.hdr); if (result) { integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, event_data->filename, "collect_data", "failed", result, 0); return result; } cur_digest = hash.hdr.digest; cur_digestsize = hash.hdr.length; out: return ima_eventdigest_init_common(cur_digest, cur_digestsize, DIGEST_TYPE__LAST, HASH_ALGO__LAST, field_data); } /* * This function writes the digest of an event (without size limit). */ int ima_eventdigest_ng_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { u8 *cur_digest = NULL, hash_algo = ima_hash_algo; u32 cur_digestsize = 0; if (event_data->violation) /* recording a violation. */ goto out; cur_digest = event_data->iint->ima_hash->digest; cur_digestsize = event_data->iint->ima_hash->length; hash_algo = event_data->iint->ima_hash->algo; out: return ima_eventdigest_init_common(cur_digest, cur_digestsize, DIGEST_TYPE__LAST, hash_algo, field_data); } /* * This function writes the digest of an event (without size limit), * prefixed with both the digest type and hash algorithm. */ int ima_eventdigest_ngv2_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { u8 *cur_digest = NULL, hash_algo = ima_hash_algo; u32 cur_digestsize = 0; u8 digest_type = DIGEST_TYPE_IMA; if (event_data->violation) /* recording a violation. */ goto out; cur_digest = event_data->iint->ima_hash->digest; cur_digestsize = event_data->iint->ima_hash->length; hash_algo = event_data->iint->ima_hash->algo; if (event_data->iint->flags & IMA_VERITY_REQUIRED) digest_type = DIGEST_TYPE_VERITY; out: return ima_eventdigest_init_common(cur_digest, cur_digestsize, digest_type, hash_algo, field_data); } /* * This function writes the digest of the file which is expected to match the * digest contained in the file's appended signature. */ int ima_eventdigest_modsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { enum hash_algo hash_algo; const u8 *cur_digest; u32 cur_digestsize; if (!event_data->modsig) return 0; if (event_data->violation) { /* Recording a violation. */ hash_algo = HASH_ALGO_SHA1; cur_digest = NULL; cur_digestsize = 0; } else { int rc; rc = ima_get_modsig_digest(event_data->modsig, &hash_algo, &cur_digest, &cur_digestsize); if (rc) return rc; else if (hash_algo == HASH_ALGO__LAST || cur_digestsize == 0) /* There was some error collecting the digest. 
*/ return -EINVAL; } return ima_eventdigest_init_common(cur_digest, cur_digestsize, DIGEST_TYPE__LAST, hash_algo, field_data); } static int ima_eventname_init_common(struct ima_event_data *event_data, struct ima_field_data *field_data, bool size_limit) { const char *cur_filename = NULL; u32 cur_filename_len = 0; BUG_ON(event_data->filename == NULL && event_data->file == NULL); if (event_data->filename) { cur_filename = event_data->filename; cur_filename_len = strlen(event_data->filename); if (!size_limit || cur_filename_len <= IMA_EVENT_NAME_LEN_MAX) goto out; } if (event_data->file) { cur_filename = event_data->file->f_path.dentry->d_name.name; cur_filename_len = strlen(cur_filename); } else /* * Truncate filename if the latter is too long and * the file descriptor is not available. */ cur_filename_len = IMA_EVENT_NAME_LEN_MAX; out: return ima_write_template_field_data(cur_filename, cur_filename_len, DATA_FMT_STRING, field_data); } /* * This function writes the name of an event (with size limit). */ int ima_eventname_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventname_init_common(event_data, field_data, true); } /* * This function writes the name of an event (without size limit). */ int ima_eventname_ng_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventname_init_common(event_data, field_data, false); } /* * ima_eventsig_init - include the file signature as part of the template data */ int ima_eventsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct evm_ima_xattr_data *xattr_value = event_data->xattr_value; if (!xattr_value || (xattr_value->type != EVM_IMA_XATTR_DIGSIG && xattr_value->type != IMA_VERITY_DIGSIG)) return ima_eventevmsig_init(event_data, field_data); return ima_write_template_field_data(xattr_value, event_data->xattr_len, DATA_FMT_HEX, field_data); } /* * ima_eventbuf_init - include the buffer(kexec-cmldine) as part of the * template data. */ int ima_eventbuf_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { if ((!event_data->buf) || (event_data->buf_len == 0)) return 0; return ima_write_template_field_data(event_data->buf, event_data->buf_len, DATA_FMT_HEX, field_data); } /* * ima_eventmodsig_init - include the appended file signature as part of the * template data */ int ima_eventmodsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { const void *data; u32 data_len; int rc; if (!event_data->modsig) return 0; /* * modsig is a runtime structure containing pointers. Get its raw data * instead. 
*/ rc = ima_get_raw_modsig(event_data->modsig, &data, &data_len); if (rc) return rc; return ima_write_template_field_data(data, data_len, DATA_FMT_HEX, field_data); } /* * ima_eventevmsig_init - include the EVM portable signature as part of the * template data */ int ima_eventevmsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct evm_ima_xattr_data *xattr_data = NULL; int rc = 0; if (!event_data->file) return 0; rc = vfs_getxattr_alloc(&nop_mnt_idmap, file_dentry(event_data->file), XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0 || xattr_data->type != EVM_XATTR_PORTABLE_DIGSIG) { rc = 0; goto out; } rc = ima_write_template_field_data((char *)xattr_data, rc, DATA_FMT_HEX, field_data); out: kfree(xattr_data); return rc; } static int ima_eventinodedac_init_common(struct ima_event_data *event_data, struct ima_field_data *field_data, bool get_uid) { unsigned int id; if (!event_data->file) return 0; if (get_uid) id = i_uid_read(file_inode(event_data->file)); else id = i_gid_read(file_inode(event_data->file)); if (ima_canonical_fmt) { if (sizeof(id) == sizeof(u16)) id = (__force u16)cpu_to_le16(id); else id = (__force u32)cpu_to_le32(id); } return ima_write_template_field_data((void *)&id, sizeof(id), DATA_FMT_UINT, field_data); } /* * ima_eventinodeuid_init - include the inode UID as part of the template * data */ int ima_eventinodeuid_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodedac_init_common(event_data, field_data, true); } /* * ima_eventinodegid_init - include the inode GID as part of the template * data */ int ima_eventinodegid_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodedac_init_common(event_data, field_data, false); } /* * ima_eventinodemode_init - include the inode mode as part of the template * data */ int ima_eventinodemode_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct inode *inode; u16 mode; if (!event_data->file) return 0; inode = file_inode(event_data->file); mode = inode->i_mode; if (ima_canonical_fmt) mode = (__force u16)cpu_to_le16(mode); return ima_write_template_field_data((char *)&mode, sizeof(mode), DATA_FMT_UINT, field_data); } static int ima_eventinodexattrs_init_common(struct ima_event_data *event_data, struct ima_field_data *field_data, char type) { u8 *buffer = NULL; int rc; if (!event_data->file) return 0; rc = evm_read_protected_xattrs(file_dentry(event_data->file), NULL, 0, type, ima_canonical_fmt); if (rc < 0) return 0; buffer = kmalloc(rc, GFP_KERNEL); if (!buffer) return 0; rc = evm_read_protected_xattrs(file_dentry(event_data->file), buffer, rc, type, ima_canonical_fmt); if (rc < 0) { rc = 0; goto out; } rc = ima_write_template_field_data((char *)buffer, rc, DATA_FMT_HEX, field_data); out: kfree(buffer); return rc; } /* * ima_eventinodexattrnames_init - include a list of xattr names as part of the * template data */ int ima_eventinodexattrnames_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodexattrs_init_common(event_data, field_data, 'n'); } /* * ima_eventinodexattrlengths_init - include a list of xattr lengths as part of * the template data */ int ima_eventinodexattrlengths_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodexattrs_init_common(event_data, field_data, 'l'); } /* * ima_eventinodexattrvalues_init - include a list of xattr values as part of * the template data 
*/ int ima_eventinodexattrvalues_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodexattrs_init_common(event_data, field_data, 'v'); }
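/*
 * Editor's note: a self-contained user-space sketch, not part of the kernel
 * sources above. It illustrates the buffer layout that ima_parse_buf() walks:
 * a sequence of fields, each encoded as a u32 length followed by that many
 * bytes of data (the kernel additionally supports a len_mask for fields whose
 * length is known out of band, and little-endian lengths when
 * ima_canonical_fmt is set; both are omitted here). The demo_* names are
 * hypothetical.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_field {
        uint32_t len;
        const uint8_t *data;
};

/* Walk a buffer of [u32 len][len bytes] records, as ima_parse_buf() does. */
static int demo_parse(const uint8_t *buf, size_t buflen,
                      struct demo_field *fields, int maxfields)
{
        const uint8_t *bufp = buf, *bufend = buf + buflen;
        int i;

        for (i = 0; i < maxfields; i++) {
                uint32_t len;

                if ((size_t)(bufend - bufp) < sizeof(len))
                        break;
                memcpy(&len, bufp, sizeof(len)); /* host byte order in this demo */
                bufp += sizeof(len);

                if ((size_t)(bufend - bufp) < len)
                        break;
                fields[i].len = len;
                fields[i].data = bufp;
                bufp += len;
        }
        return i; /* number of fields actually parsed */
}

int main(void)
{
        uint8_t buf[64];
        uint32_t l1 = 3, l2 = 8;
        struct demo_field fields[2];
        size_t off = 0;
        int n;

        /* Two fields: a fake digest and a file name, each length-prefixed. */
        memcpy(buf + off, &l1, sizeof(l1)); off += sizeof(l1);
        memcpy(buf + off, "abc", 3);        off += 3;
        memcpy(buf + off, &l2, sizeof(l2)); off += sizeof(l2);
        memcpy(buf + off, "/bin/cat", 8);   off += 8;

        n = demo_parse(buf, off, fields, 2);
        printf("parsed %d fields, first field length %u\n", n, fields[0].len);
        return 0;
}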
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 /* SPDX-License-Identifier: GPL-2.0-only */ /* * VMware vSockets Driver * * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. */ #ifndef __AF_VSOCK_H__ #define __AF_VSOCK_H__ #include <linux/kernel.h> #include <linux/workqueue.h> #include <net/sock.h> #include <uapi/linux/vm_sockets.h> #include "vsock_addr.h" #define LAST_RESERVED_PORT 1023 #define VSOCK_HASH_SIZE 251 extern struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; extern struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; extern spinlock_t vsock_table_lock; #define vsock_sk(__sk) ((struct vsock_sock *)__sk) #define sk_vsock(__vsk) (&(__vsk)->sk) struct vsock_sock { /* sk must be the first member. */ struct sock sk; const struct vsock_transport *transport; struct sockaddr_vm local_addr; struct sockaddr_vm remote_addr; /* Links for the global tables of bound and connected sockets. */ struct list_head bound_table; struct list_head connected_table; /* Accessed without the socket lock held. This means it can never be * modified outsided of socket create or destruct. */ bool trusted; bool cached_peer_allow_dgram; /* Dgram communication allowed to * cached peer? */ u32 cached_peer; /* Context ID of last dgram destination check. */ const struct cred *owner; /* Rest are SOCK_STREAM only. */ long connect_timeout; /* Listening socket that this came from. */ struct sock *listener; /* Used for pending list and accept queue during connection handshake. * The listening socket is the head for both lists. Sockets created * for connection requests are placed in the pending list until they * are connected, at which point they are put in the accept queue list * so they can be accepted in accept(). If accept() cannot accept the * connection, it is marked as rejected so the cleanup function knows * to clean up the socket. */ struct list_head pending_links; struct list_head accept_queue; bool rejected; struct delayed_work connect_work; struct delayed_work pending_work; struct delayed_work close_work; bool close_work_scheduled; u32 peer_shutdown; bool sent_request; bool ignore_connecting_rst; /* Protected by lock_sock(sk) */ u64 buffer_size; u64 buffer_min_size; u64 buffer_max_size; /* Private to transport. */ void *trans; }; s64 vsock_connectible_has_data(struct vsock_sock *vsk); s64 vsock_stream_has_data(struct vsock_sock *vsk); s64 vsock_stream_has_space(struct vsock_sock *vsk); struct sock *vsock_create_connected(struct sock *parent); void vsock_data_ready(struct sock *sk); /**** TRANSPORT ****/ struct vsock_transport_recv_notify_data { u64 data1; /* Transport-defined. */ u64 data2; /* Transport-defined. 
*/ bool notify_on_block; }; struct vsock_transport_send_notify_data { u64 data1; /* Transport-defined. */ u64 data2; /* Transport-defined. */ }; /* Transport features flags */ /* Transport provides host->guest communication */ #define VSOCK_TRANSPORT_F_H2G 0x00000001 /* Transport provides guest->host communication */ #define VSOCK_TRANSPORT_F_G2H 0x00000002 /* Transport provides DGRAM communication */ #define VSOCK_TRANSPORT_F_DGRAM 0x00000004 /* Transport provides local (loopback) communication */ #define VSOCK_TRANSPORT_F_LOCAL 0x00000008 struct vsock_transport { struct module *module; /* Initialize/tear-down socket. */ int (*init)(struct vsock_sock *, struct vsock_sock *); void (*destruct)(struct vsock_sock *); void (*release)(struct vsock_sock *); /* Cancel all pending packets sent on vsock. */ int (*cancel_pkt)(struct vsock_sock *vsk); /* Connections. */ int (*connect)(struct vsock_sock *); /* DGRAM. */ int (*dgram_bind)(struct vsock_sock *, struct sockaddr_vm *); int (*dgram_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, size_t len, int flags); int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *, struct msghdr *, size_t len); bool (*dgram_allow)(u32 cid, u32 port); /* STREAM. */ /* TODO: stream_bind() */ ssize_t (*stream_dequeue)(struct vsock_sock *, struct msghdr *, size_t len, int flags); ssize_t (*stream_enqueue)(struct vsock_sock *, struct msghdr *, size_t len); s64 (*stream_has_data)(struct vsock_sock *); s64 (*stream_has_space)(struct vsock_sock *); u64 (*stream_rcvhiwat)(struct vsock_sock *); bool (*stream_is_active)(struct vsock_sock *); bool (*stream_allow)(u32 cid, u32 port); /* SEQ_PACKET. */ ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, int flags); int (*seqpacket_enqueue)(struct vsock_sock *vsk, struct msghdr *msg, size_t len); bool (*seqpacket_allow)(u32 remote_cid); u32 (*seqpacket_has_data)(struct vsock_sock *vsk); /* Notification. */ int (*notify_poll_in)(struct vsock_sock *, size_t, bool *); int (*notify_poll_out)(struct vsock_sock *, size_t, bool *); int (*notify_recv_init)(struct vsock_sock *, size_t, struct vsock_transport_recv_notify_data *); int (*notify_recv_pre_block)(struct vsock_sock *, size_t, struct vsock_transport_recv_notify_data *); int (*notify_recv_pre_dequeue)(struct vsock_sock *, size_t, struct vsock_transport_recv_notify_data *); int (*notify_recv_post_dequeue)(struct vsock_sock *, size_t, ssize_t, bool, struct vsock_transport_recv_notify_data *); int (*notify_send_init)(struct vsock_sock *, struct vsock_transport_send_notify_data *); int (*notify_send_pre_block)(struct vsock_sock *, struct vsock_transport_send_notify_data *); int (*notify_send_pre_enqueue)(struct vsock_sock *, struct vsock_transport_send_notify_data *); int (*notify_send_post_enqueue)(struct vsock_sock *, ssize_t, struct vsock_transport_send_notify_data *); /* sk_lock held by the caller */ void (*notify_buffer_size)(struct vsock_sock *, u64 *); int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val); /* Shutdown. */ int (*shutdown)(struct vsock_sock *, int); /* Addressing. */ u32 (*get_local_cid)(void); /* Read a single skb */ int (*read_skb)(struct vsock_sock *, skb_read_actor_t); /* Zero-copy. 
*/ bool (*msgzerocopy_allow)(void); }; /**** CORE ****/ int vsock_core_register(const struct vsock_transport *t, int features); void vsock_core_unregister(const struct vsock_transport *t); /* The transport may downcast this to access transport-specific functions */ const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk); /**** UTILS ****/ /* vsock_table_lock must be held */ static inline bool __vsock_in_bound_table(struct vsock_sock *vsk) { return !list_empty(&vsk->bound_table); } /* vsock_table_lock must be held */ static inline bool __vsock_in_connected_table(struct vsock_sock *vsk) { return !list_empty(&vsk->connected_table); } void vsock_add_pending(struct sock *listener, struct sock *pending); void vsock_remove_pending(struct sock *listener, struct sock *pending); void vsock_enqueue_accept(struct sock *listener, struct sock *connected); void vsock_insert_connected(struct vsock_sock *vsk); void vsock_remove_bound(struct vsock_sock *vsk); void vsock_remove_connected(struct vsock_sock *vsk); struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); void vsock_remove_sock(struct vsock_sock *vsk); void vsock_for_each_connected_socket(struct vsock_transport *transport, void (*fn)(struct sock *sk)); int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk); bool vsock_find_cid(unsigned int cid); /**** TAP ****/ struct vsock_tap { struct net_device *dev; struct module *module; struct list_head list; }; int vsock_add_tap(struct vsock_tap *vt); int vsock_remove_tap(struct vsock_tap *vt); void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque); int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); #ifdef CONFIG_BPF_SYSCALL extern struct proto vsock_proto; int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void __init vsock_bpf_build_proto(void); #else static inline void __init vsock_bpf_build_proto(void) {} #endif static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t) { return t->msgzerocopy_allow && t->msgzerocopy_allow(); } #endif /* __AF_VSOCK_H__ */
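/*
 * Editor's note: the transport ops declared above are what ultimately back an
 * AF_VSOCK socket. The following self-contained user-space sketch (not part
 * of this header) shows the consumer side: a stream socket connecting from a
 * guest to the well-known host CID. The port number 1234 is an arbitrary
 * example value.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>

int main(void)
{
        struct sockaddr_vm addr = {
                .svm_family = AF_VSOCK,
                .svm_cid = VMADDR_CID_HOST,     /* talk to the host side */
                .svm_port = 1234,               /* arbitrary example port */
        };
        int fd = socket(AF_VSOCK, SOCK_STREAM, 0);

        if (fd < 0) {
                perror("socket");
                return 1;
        }
        if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
                perror("connect");
                close(fd);
                return 1;
        }
        if (write(fd, "ping\n", 5) < 0)
                perror("write");
        close(fd);
        return 0;
}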
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Scatterlist Cryptographic API.
 *
 * Procfs information.
 *
 * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
 * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <linux/atomic.h>
#include <linux/init.h>
#include <linux/crypto.h>
#include <linux/fips.h>
#include <linux/module.h>	/* for module_name() */
#include <linux/rwsem.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "internal.h"

static void *c_start(struct seq_file *m, loff_t *pos)
{
        down_read(&crypto_alg_sem);
        return seq_list_start(&crypto_alg_list, *pos);
}

static void *c_next(struct seq_file *m, void *p, loff_t *pos)
{
        return seq_list_next(p, &crypto_alg_list, pos);
}

static void c_stop(struct seq_file *m, void *p)
{
        up_read(&crypto_alg_sem);
}

static int c_show(struct seq_file *m, void *p)
{
        struct crypto_alg *alg = list_entry(p, struct crypto_alg, cra_list);

        seq_printf(m, "name : %s\n", alg->cra_name);
        seq_printf(m, "driver : %s\n", alg->cra_driver_name);
        seq_printf(m, "module : %s\n", module_name(alg->cra_module));
        seq_printf(m, "priority : %d\n", alg->cra_priority);
        seq_printf(m, "refcnt : %u\n", refcount_read(&alg->cra_refcnt));
        seq_printf(m, "selftest : %s\n",
                   (alg->cra_flags & CRYPTO_ALG_TESTED) ? "passed" : "unknown");
        seq_printf(m, "internal : %s\n",
                   (alg->cra_flags & CRYPTO_ALG_INTERNAL) ? "yes" : "no");
        if (fips_enabled) {
                seq_printf(m, "fips : %s\n",
                           (alg->cra_flags & CRYPTO_ALG_FIPS_INTERNAL) ? "no" : "yes");
        }

        if (alg->cra_flags & CRYPTO_ALG_LARVAL) {
                seq_printf(m, "type : larval\n");
                seq_printf(m, "flags : 0x%x\n", alg->cra_flags);
                goto out;
        }

        if (alg->cra_type && alg->cra_type->show) {
                alg->cra_type->show(m, alg);
                goto out;
        }

        switch (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) {
        case CRYPTO_ALG_TYPE_CIPHER:
                seq_printf(m, "type : cipher\n");
                seq_printf(m, "blocksize : %u\n", alg->cra_blocksize);
                seq_printf(m, "min keysize : %u\n", alg->cra_cipher.cia_min_keysize);
                seq_printf(m, "max keysize : %u\n", alg->cra_cipher.cia_max_keysize);
                break;
        case CRYPTO_ALG_TYPE_COMPRESS:
                seq_printf(m, "type : compression\n");
                break;
        default:
                seq_printf(m, "type : unknown\n");
                break;
        }

out:
        seq_putc(m, '\n');
        return 0;
}

static const struct seq_operations crypto_seq_ops = {
        .start  = c_start,
        .next   = c_next,
        .stop   = c_stop,
        .show   = c_show
};

void __init crypto_init_proc(void)
{
        proc_create_seq("crypto", 0, NULL, &crypto_seq_ops);
}

void __exit crypto_exit_proc(void)
{
        remove_proc_entry("crypto", NULL);
}
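/*
 * Editor's note: a small user-space sketch (not part of the kernel sources)
 * showing how the seq_file output above is consumed. /proc/crypto is a plain
 * text list of "key : value" records separated by blank lines; this program
 * prints every record containing the substring given on the command line
 * (the default "sha256" is just an example).
 */
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
        const char *needle = argc > 1 ? argv[1] : "sha256";
        char line[256];
        char block[4096] = "";
        FILE *f = fopen("/proc/crypto", "r");

        if (!f) {
                perror("/proc/crypto");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                if (line[0] == '\n') {          /* end of one algorithm record */
                        if (strstr(block, needle))
                                fputs(block, stdout);
                        block[0] = '\0';
                } else if (strlen(block) + strlen(line) < sizeof(block)) {
                        strcat(block, line);
                }
        }
        fclose(f);
        return 0;
}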
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
* Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" struct devlink_linecard { struct list_head list; struct devlink *devlink; unsigned int index; const struct devlink_linecard_ops *ops; void *priv; enum devlink_linecard_state state; struct mutex state_lock; /* Protects state */ const char *type; struct devlink_linecard_type *types; unsigned int types_count; u32 rel_index; }; unsigned int devlink_linecard_index(struct devlink_linecard *linecard) { return linecard->index; } static struct devlink_linecard * devlink_linecard_get_by_index(struct devlink *devlink, unsigned int linecard_index) { struct devlink_linecard *devlink_linecard; list_for_each_entry(devlink_linecard, &devlink->linecard_list, list) { if (devlink_linecard->index == linecard_index) return devlink_linecard; } return NULL; } static bool devlink_linecard_index_exists(struct devlink *devlink, unsigned int linecard_index) { return devlink_linecard_get_by_index(devlink, linecard_index); } static struct devlink_linecard * devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { if (attrs[DEVLINK_ATTR_LINECARD_INDEX]) { u32 linecard_index = nla_get_u32(attrs[DEVLINK_ATTR_LINECARD_INDEX]); struct devlink_linecard *linecard; linecard = devlink_linecard_get_by_index(devlink, linecard_index); if (!linecard) return ERR_PTR(-ENODEV); return linecard; } return ERR_PTR(-EINVAL); } static struct devlink_linecard * devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_linecard_get_from_attrs(devlink, info->attrs); } struct devlink_linecard_type { const char *type; const void *priv; }; static int devlink_nl_linecard_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_linecard *linecard, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { struct devlink_linecard_type *linecard_type; struct nlattr *attr; void *hdr; int i; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index)) goto nla_put_failure; if (nla_put_u8(msg, DEVLINK_ATTR_LINECARD_STATE, linecard->state)) goto nla_put_failure; if (linecard->type && nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, linecard->type)) goto nla_put_failure; if (linecard->types_count) { attr = nla_nest_start(msg, DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES); if (!attr) goto nla_put_failure; for (i = 0; i < linecard->types_count; i++) { linecard_type = &linecard->types[i]; if (nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, linecard_type->type)) { nla_nest_cancel(msg, attr); goto nla_put_failure; } } nla_nest_end(msg, attr); } if (devlink_rel_devlink_handle_put(msg, devlink, linecard->rel_index, DEVLINK_ATTR_NESTED_DEVLINK, NULL)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void devlink_linecard_notify(struct devlink_linecard *linecard, enum devlink_command cmd) { struct devlink *devlink = linecard->devlink; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_LINECARD_NEW && cmd != DEVLINK_CMD_LINECARD_DEL); if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_linecard_fill(msg, devlink, linecard, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } devlink_nl_notify_send(devlink, msg); } void 
devlink_linecards_notify_register(struct devlink *devlink) { struct devlink_linecard *linecard; list_for_each_entry(linecard, &devlink->linecard_list, list) devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); } void devlink_linecards_notify_unregister(struct devlink *devlink) { struct devlink_linecard *linecard; list_for_each_entry_reverse(linecard, &devlink->linecard_list, list) devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); } int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_linecard *linecard; struct sk_buff *msg; int err; linecard = devlink_linecard_get_from_info(devlink, info); if (IS_ERR(linecard)) return PTR_ERR(linecard); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; mutex_lock(&linecard->state_lock); err = devlink_nl_linecard_fill(msg, devlink, linecard, DEVLINK_CMD_LINECARD_NEW, info->snd_portid, info->snd_seq, 0, info->extack); mutex_unlock(&linecard->state_lock); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int devlink_nl_linecard_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_linecard *linecard; int idx = 0; int err = 0; list_for_each_entry(linecard, &devlink->linecard_list, list) { if (idx < state->idx) { idx++; continue; } mutex_lock(&linecard->state_lock); err = devlink_nl_linecard_fill(msg, devlink, linecard, DEVLINK_CMD_LINECARD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, cb->extack); mutex_unlock(&linecard->state_lock); if (err) { state->idx = idx; break; } idx++; } return err; } int devlink_nl_linecard_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_linecard_get_dump_one); } static struct devlink_linecard_type * devlink_linecard_type_lookup(struct devlink_linecard *linecard, const char *type) { struct devlink_linecard_type *linecard_type; int i; for (i = 0; i < linecard->types_count; i++) { linecard_type = &linecard->types[i]; if (!strcmp(type, linecard_type->type)) return linecard_type; } return NULL; } static int devlink_linecard_type_set(struct devlink_linecard *linecard, const char *type, struct netlink_ext_ack *extack) { const struct devlink_linecard_ops *ops = linecard->ops; struct devlink_linecard_type *linecard_type; int err; mutex_lock(&linecard->state_lock); if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); err = -EBUSY; goto out; } if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); err = -EBUSY; goto out; } linecard_type = devlink_linecard_type_lookup(linecard, type); if (!linecard_type) { NL_SET_ERR_MSG(extack, "Unsupported line card type provided"); err = -EINVAL; goto out; } if (linecard->state != DEVLINK_LINECARD_STATE_UNPROVISIONED && linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { NL_SET_ERR_MSG(extack, "Line card already provisioned"); err = -EBUSY; /* Check if the line card is provisioned in the same * way the user asks. In case it is, make the operation * to return success. 
*/ if (ops->same_provision && ops->same_provision(linecard, linecard->priv, linecard_type->type, linecard_type->priv)) err = 0; goto out; } linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING; linecard->type = linecard_type->type; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); mutex_unlock(&linecard->state_lock); err = ops->provision(linecard, linecard->priv, linecard_type->type, linecard_type->priv, extack); if (err) { /* Provisioning failed. Assume the linecard is unprovisioned * for future operations. */ mutex_lock(&linecard->state_lock); linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; linecard->type = NULL; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); mutex_unlock(&linecard->state_lock); } return err; out: mutex_unlock(&linecard->state_lock); return err; } static int devlink_linecard_type_unset(struct devlink_linecard *linecard, struct netlink_ext_ack *extack) { int err; mutex_lock(&linecard->state_lock); if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); err = -EBUSY; goto out; } if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); err = -EBUSY; goto out; } if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; linecard->type = NULL; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); err = 0; goto out; } if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) { NL_SET_ERR_MSG(extack, "Line card is not provisioned"); err = 0; goto out; } linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONING; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); mutex_unlock(&linecard->state_lock); err = linecard->ops->unprovision(linecard, linecard->priv, extack); if (err) { /* Unprovisioning failed. Assume the linecard is unprovisioned * for future operations. 
*/ mutex_lock(&linecard->state_lock); linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; linecard->type = NULL; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); mutex_unlock(&linecard->state_lock); } return err; out: mutex_unlock(&linecard->state_lock); return err; } int devlink_nl_linecard_set_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_linecard *linecard; int err; linecard = devlink_linecard_get_from_info(devlink, info); if (IS_ERR(linecard)) return PTR_ERR(linecard); if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) { const char *type; type = nla_data(info->attrs[DEVLINK_ATTR_LINECARD_TYPE]); if (strcmp(type, "")) { err = devlink_linecard_type_set(linecard, type, extack); if (err) return err; } else { err = devlink_linecard_type_unset(linecard, extack); if (err) return err; } } return 0; } static int devlink_linecard_types_init(struct devlink_linecard *linecard) { struct devlink_linecard_type *linecard_type; unsigned int count; int i; count = linecard->ops->types_count(linecard, linecard->priv); linecard->types = kmalloc_array(count, sizeof(*linecard_type), GFP_KERNEL); if (!linecard->types) return -ENOMEM; linecard->types_count = count; for (i = 0; i < count; i++) { linecard_type = &linecard->types[i]; linecard->ops->types_get(linecard, linecard->priv, i, &linecard_type->type, &linecard_type->priv); } return 0; } static void devlink_linecard_types_fini(struct devlink_linecard *linecard) { kfree(linecard->types); } /** * devl_linecard_create - Create devlink linecard * * @devlink: devlink * @linecard_index: driver-specific numerical identifier of the linecard * @ops: linecards ops * @priv: user priv pointer * * Create devlink linecard instance with provided linecard index. * Caller can use any indexing, even hw-related one. * * Return: Line card structure or an ERR_PTR() encoded error code. 
*/ struct devlink_linecard * devl_linecard_create(struct devlink *devlink, unsigned int linecard_index, const struct devlink_linecard_ops *ops, void *priv) { struct devlink_linecard *linecard; int err; if (WARN_ON(!ops || !ops->provision || !ops->unprovision || !ops->types_count || !ops->types_get)) return ERR_PTR(-EINVAL); if (devlink_linecard_index_exists(devlink, linecard_index)) return ERR_PTR(-EEXIST); linecard = kzalloc(sizeof(*linecard), GFP_KERNEL); if (!linecard) return ERR_PTR(-ENOMEM); linecard->devlink = devlink; linecard->index = linecard_index; linecard->ops = ops; linecard->priv = priv; linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; mutex_init(&linecard->state_lock); err = devlink_linecard_types_init(linecard); if (err) { mutex_destroy(&linecard->state_lock); kfree(linecard); return ERR_PTR(err); } list_add_tail(&linecard->list, &devlink->linecard_list); devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); return linecard; } EXPORT_SYMBOL_GPL(devl_linecard_create); /** * devl_linecard_destroy - Destroy devlink linecard * * @linecard: devlink linecard */ void devl_linecard_destroy(struct devlink_linecard *linecard) { devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); list_del(&linecard->list); devlink_linecard_types_fini(linecard); mutex_destroy(&linecard->state_lock); kfree(linecard); } EXPORT_SYMBOL_GPL(devl_linecard_destroy); /** * devlink_linecard_provision_set - Set provisioning on linecard * * @linecard: devlink linecard * @type: linecard type * * This is either called directly from the provision() op call or * as a result of the provision() op call asynchronously. */ void devlink_linecard_provision_set(struct devlink_linecard *linecard, const char *type) { mutex_lock(&linecard->state_lock); WARN_ON(linecard->type && strcmp(linecard->type, type)); linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; linecard->type = type; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); mutex_unlock(&linecard->state_lock); } EXPORT_SYMBOL_GPL(devlink_linecard_provision_set); /** * devlink_linecard_provision_clear - Clear provisioning on linecard * * @linecard: devlink linecard * * This is either called directly from the unprovision() op call or * as a result of the unprovision() op call asynchronously. */ void devlink_linecard_provision_clear(struct devlink_linecard *linecard) { mutex_lock(&linecard->state_lock); linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; linecard->type = NULL; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); mutex_unlock(&linecard->state_lock); } EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear); /** * devlink_linecard_provision_fail - Fail provisioning on linecard * * @linecard: devlink linecard * * This is either called directly from the provision() op call or * as a result of the provision() op call asynchronously. 
*/ void devlink_linecard_provision_fail(struct devlink_linecard *linecard) { mutex_lock(&linecard->state_lock); linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); mutex_unlock(&linecard->state_lock); } EXPORT_SYMBOL_GPL(devlink_linecard_provision_fail); /** * devlink_linecard_activate - Set linecard active * * @linecard: devlink linecard */ void devlink_linecard_activate(struct devlink_linecard *linecard) { mutex_lock(&linecard->state_lock); WARN_ON(linecard->state != DEVLINK_LINECARD_STATE_PROVISIONED); linecard->state = DEVLINK_LINECARD_STATE_ACTIVE; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); mutex_unlock(&linecard->state_lock); } EXPORT_SYMBOL_GPL(devlink_linecard_activate); /** * devlink_linecard_deactivate - Set linecard inactive * * @linecard: devlink linecard */ void devlink_linecard_deactivate(struct devlink_linecard *linecard) { mutex_lock(&linecard->state_lock); switch (linecard->state) { case DEVLINK_LINECARD_STATE_ACTIVE: linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); break; case DEVLINK_LINECARD_STATE_UNPROVISIONING: /* Line card is being deactivated as part * of unprovisioning flow. */ break; default: WARN_ON(1); break; } mutex_unlock(&linecard->state_lock); } EXPORT_SYMBOL_GPL(devlink_linecard_deactivate); static void devlink_linecard_rel_notify_cb(struct devlink *devlink, u32 linecard_index) { struct devlink_linecard *linecard; linecard = devlink_linecard_get_by_index(devlink, linecard_index); if (!linecard) return; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); } static void devlink_linecard_rel_cleanup_cb(struct devlink *devlink, u32 linecard_index, u32 rel_index) { struct devlink_linecard *linecard; linecard = devlink_linecard_get_by_index(devlink, linecard_index); if (linecard && linecard->rel_index == rel_index) linecard->rel_index = 0; } /** * devlink_linecard_nested_dl_set - Attach/detach nested devlink * instance to linecard. * * @linecard: devlink linecard * @nested_devlink: devlink instance to attach or NULL to detach */ int devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, struct devlink *nested_devlink) { return devlink_rel_nested_in_add(&linecard->rel_index, linecard->devlink->index, linecard->index, devlink_linecard_rel_notify_cb, devlink_linecard_rel_cleanup_cb, nested_devlink); } EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set);
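/*
 * Editor's note: an illustrative driver-side sketch, not part of the file
 * above. It shows how a driver would consume this linecard API: provide an
 * ops table, create the linecard object, and report provisioning results
 * back through devlink_linecard_provision_set()/_clear(). The callback
 * signatures are inferred from the call sites in this file (the authoritative
 * prototypes live in include/net/devlink.h), and all demo_* names and type
 * strings are hypothetical.
 */
#include <linux/kernel.h>
#include <net/devlink.h>

static const char * const demo_types[] = { "16x100G", "8x200G" };

static int demo_provision(struct devlink_linecard *linecard, void *priv,
                          const char *type, const void *type_priv,
                          struct netlink_ext_ack *extack)
{
        /* Program the hardware for @type here, then report success. */
        devlink_linecard_provision_set(linecard, type);
        return 0;
}

static int demo_unprovision(struct devlink_linecard *linecard, void *priv,
                            struct netlink_ext_ack *extack)
{
        devlink_linecard_provision_clear(linecard);
        return 0;
}

static unsigned int demo_types_count(struct devlink_linecard *linecard,
                                     void *priv)
{
        return ARRAY_SIZE(demo_types);
}

static void demo_types_get(struct devlink_linecard *linecard, void *priv,
                           unsigned int index, const char **type,
                           const void **type_priv)
{
        *type = demo_types[index];
        *type_priv = NULL;
}

static const struct devlink_linecard_ops demo_linecard_ops = {
        .provision      = demo_provision,
        .unprovision    = demo_unprovision,
        .types_count    = demo_types_count,
        .types_get      = demo_types_get,
};

/* Call with the devlink instance lock held, e.g. from the driver probe path. */
static struct devlink_linecard *demo_linecard_register(struct devlink *devlink)
{
        return devl_linecard_create(devlink, 1, &demo_linecard_ops, NULL);
}

/*
 * From user space the same flow is typically driven with iproute2, e.g.
 * "devlink lc set pci/0000:01:00.0 lc 1 type 16x100G" (exact syntax may vary
 * between iproute2 versions).
 */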
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#ifndef BTRFS_INODE_H
#define BTRFS_INODE_H

#include <linux/hash.h>
#include <linux/refcount.h>
#include <linux/fscrypt.h>
#include <trace/events/btrfs.h>
#include "extent_map.h"
#include "extent_io.h"
#include "ordered-data.h"
#include "delayed-inode.h"

/*
 * Since we search a directory based on f_pos (struct dir_context::pos) we have
 * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so
 * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()).
 */
#define BTRFS_DIR_START_INDEX 2

/*
 * ordered_data_close is set by truncate when a file that used
 * to have good data has been truncated to zero. When it is set
 * the btrfs file release call will add this inode to the
 * ordered operations list so that we make sure to flush out any
 * new data the application may have written before commit.
 */
enum {
        BTRFS_INODE_FLUSH_ON_CLOSE,
        BTRFS_INODE_DUMMY,
        BTRFS_INODE_IN_DEFRAG,
        BTRFS_INODE_HAS_ASYNC_EXTENT,
        /*
         * Always set under the VFS' inode lock, otherwise it can cause races
         * during fsync (we start as a fast fsync and then end up in a full
         * fsync racing with ordered extent completion.
*/ BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_INODE_COPY_EVERYTHING, BTRFS_INODE_IN_DELALLOC_LIST, BTRFS_INODE_HAS_PROPS, BTRFS_INODE_SNAPSHOT_FLUSH, /* * Set and used when logging an inode and it serves to signal that an * inode does not have xattrs, so subsequent fsyncs can avoid searching * for xattrs to log. This bit must be cleared whenever a xattr is added * to an inode. */ BTRFS_INODE_NO_XATTRS, /* * Set when we are in a context where we need to start a transaction and * have dirty pages with the respective file range locked. This is to * ensure that when reserving space for the transaction, if we are low * on available space and need to flush delalloc, we will not flush * delalloc for this inode, because that could result in a deadlock (on * the file range, inode's io_tree). */ BTRFS_INODE_NO_DELALLOC_FLUSH, /* * Set when we are working on enabling verity for a file. Computing and * writing the whole Merkle tree can take a while so we want to prevent * races where two separate tasks attempt to simultaneously start verity * on the same file. */ BTRFS_INODE_VERITY_IN_PROGRESS, /* Set when this inode is a free space inode. */ BTRFS_INODE_FREE_SPACE_INODE, /* Set when there are no capabilities in XATTs for the inode. */ BTRFS_INODE_NO_CAP_XATTR, }; /* in memory btrfs inode */ struct btrfs_inode { /* which subvolume this inode belongs to */ struct btrfs_root *root; /* key used to find this inode on disk. This is used by the code * to read in roots of subvolumes */ struct btrfs_key location; /* Cached value of inode property 'compression'. */ u8 prop_compress; /* * Force compression on the file using the defrag ioctl, could be * different from prop_compress and takes precedence if set. */ u8 defrag_compress; /* * Lock for counters and all fields used to determine if the inode is in * the log or not (last_trans, last_sub_trans, last_log_commit, * logged_trans), to access/update delalloc_bytes, new_delalloc_bytes, * defrag_bytes, disk_i_size, outstanding_extents, csum_bytes and to * update the VFS' inode number of bytes used. */ spinlock_t lock; /* the extent_tree has caches of all the extent mappings to disk */ struct extent_map_tree extent_tree; /* the io_tree does range state (DIRTY, LOCKED etc) */ struct extent_io_tree io_tree; /* * Keep track of where the inode has extent items mapped in order to * make sure the i_size adjustments are accurate. Not required when the * filesystem is NO_HOLES, the status can't be set while mounted as * it's a mkfs-time feature. */ struct extent_io_tree *file_extent_tree; /* held while logging the inode in tree-log.c */ struct mutex log_mutex; /* * Counters to keep track of the number of extent item's we may use due * to delalloc and such. outstanding_extents is the number of extent * items we think we'll end up using, and reserved_extents is the number * of extent items we've reserved metadata for. Protected by 'lock'. */ unsigned outstanding_extents; /* used to order data wrt metadata */ spinlock_t ordered_tree_lock; struct rb_root ordered_tree; struct rb_node *ordered_tree_last; /* list of all the delalloc inodes in the FS. There are times we need * to write all the delalloc pages to disk, and this list is used * to walk them all. */ struct list_head delalloc_inodes; /* node for the red-black tree that links inodes in subvolume root */ struct rb_node rb_node; unsigned long runtime_flags; /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. 
*/ u64 generation; /* * ID of the transaction handle that last modified this inode. * Protected by 'lock'. */ u64 last_trans; /* * ID of the transaction that last logged this inode. * Protected by 'lock'. */ u64 logged_trans; /* * Log transaction ID when this inode was last modified. * Protected by 'lock'. */ int last_sub_trans; /* A local copy of root's last_log_commit. Protected by 'lock'. */ int last_log_commit; union { /* * Total number of bytes pending delalloc, used by stat to * calculate the real block usage of the file. This is used * only for files. Protected by 'lock'. */ u64 delalloc_bytes; /* * The lowest possible index of the next dir index key which * points to an inode that needs to be logged. * This is used only for directories. * Use the helpers btrfs_get_first_dir_index_to_log() and * btrfs_set_first_dir_index_to_log() to access this field. */ u64 first_dir_index_to_log; }; union { /* * Total number of bytes pending delalloc that fall within a file * range that is either a hole or beyond EOF (and no prealloc extent * exists in the range). This is always <= delalloc_bytes and this * is used only for files. Protected by 'lock'. */ u64 new_delalloc_bytes; /* * The offset of the last dir index key that was logged. * This is used only for directories. */ u64 last_dir_index_offset; }; /* * Total number of bytes pending defrag, used by stat to check whether * it needs COW. Protected by 'lock'. */ u64 defrag_bytes; /* * The size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk * because not all the blocks are written yet. Protected by 'lock'. */ u64 disk_i_size; /* * If this is a directory then index_cnt is the counter for the index * number for new files that are created. For an empty directory, this * must be initialized to BTRFS_DIR_START_INDEX. */ u64 index_cnt; /* Cache the directory index number to speed the dir/file remove */ u64 dir_index; /* the fsync log has some corner cases that mean we have to check * directories to see if any unlinks have been done before * the directory was logged. See tree-log.c for all the * details */ u64 last_unlink_trans; /* * The id/generation of the last transaction where this inode was * either the source or the destination of a clone/dedupe operation. * Used when logging an inode to know if there are shared extents that * need special care when logging checksum items, to avoid duplicate * checksum items in a log (which can lead to a corruption where we end * up with missing checksum ranges after log replay). * Protected by the vfs inode lock. */ u64 last_reflink_trans; /* * Number of bytes outstanding that are going to need csums. This is * used in ENOSPC accounting. Protected by 'lock'. */ u64 csum_bytes; /* Backwards incompatible flags, lower half of inode_item::flags */ u32 flags; /* Read-only compatibility flags, upper half of inode_item::flags */ u32 ro_flags; struct btrfs_block_rsv block_rsv; struct btrfs_delayed_node *delayed_node; /* File creation time. 
*/ u64 i_otime_sec; u32 i_otime_nsec; /* Hook into fs_info->delayed_iputs */ struct list_head delayed_iput; struct rw_semaphore i_mmap_lock; struct inode vfs_inode; }; static inline u64 btrfs_get_first_dir_index_to_log(const struct btrfs_inode *inode) { return READ_ONCE(inode->first_dir_index_to_log); } static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode, u64 index) { WRITE_ONCE(inode->first_dir_index_to_log, index); } static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); } static inline unsigned long btrfs_inode_hash(u64 objectid, const struct btrfs_root *root) { u64 h = objectid ^ (root->root_key.objectid * GOLDEN_RATIO_PRIME); #if BITS_PER_LONG == 32 h = (h >> 32) ^ (h & 0xffffffff); #endif return (unsigned long)h; } #if BITS_PER_LONG == 32 /* * On 32 bit systems the i_ino of struct inode is 32 bits (unsigned long), so * we use the inode's location objectid which is a u64 to avoid truncation. */ static inline u64 btrfs_ino(const struct btrfs_inode *inode) { u64 ino = inode->location.objectid; /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */ if (inode->location.type == BTRFS_ROOT_ITEM_KEY) ino = inode->vfs_inode.i_ino; return ino; } #else static inline u64 btrfs_ino(const struct btrfs_inode *inode) { return inode->vfs_inode.i_ino; } #endif static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size) { i_size_write(&inode->vfs_inode, size); inode->disk_i_size = size; } static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) { return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags); } static inline bool is_data_inode(struct inode *inode) { return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID; } static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, int mod) { lockdep_assert_held(&inode->lock); inode->outstanding_extents += mod; if (btrfs_is_free_space_inode(inode)) return; trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode), mod, inode->outstanding_extents); } /* * Called every time after doing a buffered, direct IO or memory mapped write. * * This is to ensure that if we write to a file that was previously fsynced in * the current transaction, then try to fsync it again in the same transaction, * we will know that there were changes in the file and that it needs to be * logged. */ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode) { spin_lock(&inode->lock); inode->last_sub_trans = inode->root->log_transid; spin_unlock(&inode->lock); } /* * Should be called while holding the inode's VFS lock in exclusive mode or in a * context where no one else can access the inode concurrently (during inode * creation or when loading an inode from disk). */ static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode) { set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * The inode may have been part of a reflink operation in the last * transaction that modified it, and then a fsync has reset the * last_reflink_trans to avoid subsequent fsyncs in the same * transaction to do unnecessary work. So update last_reflink_trans * to the last_trans value (we have to be pessimistic and assume a * reflink happened). * * The ->last_trans is protected by the inode's spinlock and we can * have a concurrent ordered extent completion update it. 
Also set * last_reflink_trans to ->last_trans only if the former is less than * the later, because we can be called in a context where * last_reflink_trans was set to the current transaction generation * while ->last_trans was not yet updated in the current transaction, * and therefore has a lower value. */ spin_lock(&inode->lock); if (inode->last_reflink_trans < inode->last_trans) inode->last_reflink_trans = inode->last_trans; spin_unlock(&inode->lock); } static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { bool ret = false; spin_lock(&inode->lock); if (inode->logged_trans == generation && inode->last_sub_trans <= inode->last_log_commit && inode->last_sub_trans <= btrfs_get_root_last_log_commit(inode->root)) ret = true; spin_unlock(&inode->lock); return ret; } /* * Check if the inode has flags compatible with compression */ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) { if (inode->flags & BTRFS_INODE_NODATACOW || inode->flags & BTRFS_INODE_NODATASUM) return false; return true; } /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict); void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, const struct fscrypt_str *name); int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const struct fscrypt_str *name, int add_backref, u64 index); int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int front); int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); struct btrfs_new_inode_args { /* Input */ struct inode *dir; struct dentry *dentry; struct inode *inode; bool orphan; bool subvol; /* Output from btrfs_new_inode_prepare(), input to btrfs_create_new_inode(). 
*/ struct posix_acl *default_acl; struct posix_acl *acl; struct fscrypt_name fname; }; int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, unsigned int *trans_num_items); int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args); void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap, struct inode *dir); void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits); void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits); void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other); void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split); void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); void btrfs_evict_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); void btrfs_free_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); struct inode *btrfs_iget_path(struct super_block *s, u64 ino, struct btrfs_root *root, struct btrfs_path *path); struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); void btrfs_add_delayed_iput(struct btrfs_inode *inode); void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, struct writeback_control *wbc); int btrfs_writepage_cow_fixup(struct page *page); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 disk_io_size, struct page **pages); ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, struct btrfs_ioctl_encoded_io_args *encoded); ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before); struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, size_t done_before); extern const struct dentry_operations btrfs_dentry_operations; /* Inode locking type flags, by default the exclusive lock is taken. 
*/ enum btrfs_ilock_type { ENUM_BIT(BTRFS_ILOCK_SHARED), ENUM_BIT(BTRFS_ILOCK_TRY), ENUM_BIT(BTRFS_ILOCK_MMAP), }; int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags); void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags); void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes); void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); #endif
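/*
 * Editor's note: a self-contained user-space sketch (not kernel code)
 * illustrating the pattern this header relies on: the VFS inode is embedded
 * inside the filesystem-specific inode, and BTRFS_I() uses container_of() to
 * get from the generic struct inode back to the surrounding struct
 * btrfs_inode. The demo_* names are hypothetical stand-ins.
 */
#include <stddef.h>
#include <stdio.h>

#define demo_container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_vfs_inode {
        unsigned long i_ino;
};

struct demo_btrfs_inode {
        unsigned long long generation;          /* fs-specific state ... */
        struct demo_vfs_inode vfs_inode;        /* ... with the VFS inode embedded */
};

/* Mirror of BTRFS_I(): recover the outer structure from the embedded inode. */
static struct demo_btrfs_inode *DEMO_I(struct demo_vfs_inode *inode)
{
        return demo_container_of(inode, struct demo_btrfs_inode, vfs_inode);
}

int main(void)
{
        struct demo_btrfs_inode bi = {
                .generation = 42,
                .vfs_inode = { .i_ino = 7 },
        };
        struct demo_vfs_inode *vfs = &bi.vfs_inode;     /* what the VFS passes around */

        printf("ino=%lu generation=%llu\n", vfs->i_ino, DEMO_I(vfs)->generation);
        return 0;
}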
// SPDX-License-Identifier: GPL-2.0-only /* * * Copyright (C) 2011 Novell Inc. */ #include <uapi/linux/magic.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/xattr.h> #include <linux/mount.h> #include <linux/parser.h> #include <linux/module.h> #include <linux/statfs.h> #include <linux/seq_file.h> #include <linux/posix_acl_xattr.h> #include <linux/exportfs.h> #include <linux/file.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "overlayfs.h" #include "params.h" MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Overlay filesystem"); MODULE_LICENSE("GPL"); struct ovl_dir_cache; static struct dentry *ovl_d_real(struct dentry *dentry, const struct inode *inode) { struct dentry *real = NULL, *lower; int err; /* * vfs is only expected to call d_real() with NULL from d_real_inode() * and with overlay inode from file_dentry() on an overlay file. * * TODO: remove @inode argument from d_real() API, remove code in this * function that deals with non-NULL @inode and remove d_real() call * from file_dentry(). */ if (inode && d_inode(dentry) == inode) return dentry; else if (inode) goto bug; if (!d_is_reg(dentry)) { /* d_real_inode() is only relevant for regular files */ return dentry; } real = ovl_dentry_upper(dentry); if (real && (inode == d_inode(real))) return real; if (real && !inode && ovl_has_upperdata(d_inode(dentry))) return real; /* * Best effort lazy lookup of lowerdata for !inode case to return * the real lowerdata dentry. The only current caller of d_real() with * NULL inode is d_real_inode() from trace_uprobe and this caller is * likely going to be followed reading from the file, before placing * uprobes on offset within the file, so lowerdata should be available * when setting the uprobe. */ err = ovl_verify_lowerdata(dentry); if (err) goto bug; lower = ovl_dentry_lowerdata(dentry); if (!lower) goto bug; real = lower; /* Handle recursion */ real = d_real(real, inode); if (!inode || inode == d_inode(real)) return real; bug: WARN(1, "%s(%pd4, %s:%lu): real dentry (%p/%lu) not found\n", __func__, dentry, inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0, real, real && d_inode(real) ?
d_inode(real)->i_ino : 0); return dentry; } static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak) { int ret = 1; if (!d) return 1; if (weak) { if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) ret = d->d_op->d_weak_revalidate(d, flags); } else if (d->d_flags & DCACHE_OP_REVALIDATE) { ret = d->d_op->d_revalidate(d, flags); if (!ret) { if (!(flags & LOOKUP_RCU)) d_invalidate(d); ret = -ESTALE; } } return ret; } static int ovl_dentry_revalidate_common(struct dentry *dentry, unsigned int flags, bool weak) { struct ovl_entry *oe; struct ovl_path *lowerstack; struct inode *inode = d_inode_rcu(dentry); struct dentry *upper; unsigned int i; int ret = 1; /* Careful in RCU mode */ if (!inode) return -ECHILD; oe = OVL_I_E(inode); lowerstack = ovl_lowerstack(oe); upper = ovl_i_dentry_upper(inode); if (upper) ret = ovl_revalidate_real(upper, flags, weak); for (i = 0; ret > 0 && i < ovl_numlower(oe); i++) ret = ovl_revalidate_real(lowerstack[i].dentry, flags, weak); return ret; } static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags) { return ovl_dentry_revalidate_common(dentry, flags, false); } static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags) { return ovl_dentry_revalidate_common(dentry, flags, true); } static const struct dentry_operations ovl_dentry_operations = { .d_real = ovl_d_real, .d_revalidate = ovl_dentry_revalidate, .d_weak_revalidate = ovl_dentry_weak_revalidate, }; static struct kmem_cache *ovl_inode_cachep; static struct inode *ovl_alloc_inode(struct super_block *sb) { struct ovl_inode *oi = alloc_inode_sb(sb, ovl_inode_cachep, GFP_KERNEL); if (!oi) return NULL; oi->cache = NULL; oi->redirect = NULL; oi->version = 0; oi->flags = 0; oi->__upperdentry = NULL; oi->lowerdata_redirect = NULL; oi->oe = NULL; mutex_init(&oi->lock); return &oi->vfs_inode; } static void ovl_free_inode(struct inode *inode) { struct ovl_inode *oi = OVL_I(inode); kfree(oi->redirect); kfree(oi->oe); mutex_destroy(&oi->lock); kmem_cache_free(ovl_inode_cachep, oi); } static void ovl_destroy_inode(struct inode *inode) { struct ovl_inode *oi = OVL_I(inode); dput(oi->__upperdentry); ovl_stack_put(ovl_lowerstack(oi->oe), ovl_numlower(oi->oe)); if (S_ISDIR(inode->i_mode)) ovl_dir_cache_free(inode); else kfree(oi->lowerdata_redirect); } static void ovl_put_super(struct super_block *sb) { struct ovl_fs *ofs = OVL_FS(sb); if (ofs) ovl_free_fs(ofs); } /* Sync real dirty inodes in upper filesystem (if it exists) */ static int ovl_sync_fs(struct super_block *sb, int wait) { struct ovl_fs *ofs = OVL_FS(sb); struct super_block *upper_sb; int ret; ret = ovl_sync_status(ofs); /* * We have to always set the err, because the return value isn't * checked in syncfs, and instead indirectly return an error via * the sb's writeback errseq, which VFS inspects after this call. */ if (ret < 0) { errseq_set(&sb->s_wb_err, -EIO); return -EIO; } if (!ret) return ret; /* * Not called for sync(2) call or an emergency sync (SB_I_SKIP_SYNC). * All the super blocks will be iterated, including upper_sb. * * If this is a syncfs(2) call, then we do need to call * sync_filesystem() on upper_sb, but enough if we do it when being * called with wait == 1. */ if (!wait) return 0; upper_sb = ovl_upper_mnt(ofs)->mnt_sb; down_read(&upper_sb->s_umount); ret = sync_filesystem(upper_sb); up_read(&upper_sb->s_umount); return ret; } /** * ovl_statfs * @dentry: The dentry to query * @buf: The struct kstatfs to fill in with stats * * Get the filesystem statistics. 
As writes always target the upper layer * filesystem pass the statfs to the upper filesystem (if it exists) */ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ovl_fs *ofs = OVL_FS(sb); struct dentry *root_dentry = sb->s_root; struct path path; int err; ovl_path_real(root_dentry, &path); err = vfs_statfs(&path, buf); if (!err) { buf->f_namelen = ofs->namelen; buf->f_type = OVERLAYFS_SUPER_MAGIC; if (ovl_has_fsid(ofs)) buf->f_fsid = uuid_to_fsid(sb->s_uuid.b); } return err; } static const struct super_operations ovl_super_operations = { .alloc_inode = ovl_alloc_inode, .free_inode = ovl_free_inode, .destroy_inode = ovl_destroy_inode, .drop_inode = generic_delete_inode, .put_super = ovl_put_super, .sync_fs = ovl_sync_fs, .statfs = ovl_statfs, .show_options = ovl_show_options, }; #define OVL_WORKDIR_NAME "work" #define OVL_INDEXDIR_NAME "index" static struct dentry *ovl_workdir_create(struct ovl_fs *ofs, const char *name, bool persist) { struct inode *dir = ofs->workbasedir->d_inode; struct vfsmount *mnt = ovl_upper_mnt(ofs); struct dentry *work; int err; bool retried = false; inode_lock_nested(dir, I_MUTEX_PARENT); retry: work = ovl_lookup_upper(ofs, name, ofs->workbasedir, strlen(name)); if (!IS_ERR(work)) { struct iattr attr = { .ia_valid = ATTR_MODE, .ia_mode = S_IFDIR | 0, }; if (work->d_inode) { err = -EEXIST; if (retried) goto out_dput; if (persist) goto out_unlock; retried = true; err = ovl_workdir_cleanup(ofs, dir, mnt, work, 0); dput(work); if (err == -EINVAL) { work = ERR_PTR(err); goto out_unlock; } goto retry; } err = ovl_mkdir_real(ofs, dir, &work, attr.ia_mode); if (err) goto out_dput; /* Weird filesystem returning with hashed negative (kernfs)? */ err = -EINVAL; if (d_really_is_negative(work)) goto out_dput; /* * Try to remove POSIX ACL xattrs from workdir. We are good if: * * a) success (there was a POSIX ACL xattr and was removed) * b) -ENODATA (there was no POSIX ACL xattr) * c) -EOPNOTSUPP (POSIX ACL xattrs are not supported) * * There are various other error values that could effectively * mean that the xattr doesn't exist (e.g. -ERANGE is returned * if the xattr name is too long), but the set of filesystems * allowed as upper are limited to "normal" ones, where checking * for the above two errors is sufficient. 
*/ err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_DEFAULT); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_ACCESS); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; /* Clear any inherited mode bits */ inode_lock(work->d_inode); err = ovl_do_notify_change(ofs, work, &attr); inode_unlock(work->d_inode); if (err) goto out_dput; } else { err = PTR_ERR(work); goto out_err; } out_unlock: inode_unlock(dir); return work; out_dput: dput(work); out_err: pr_warn("failed to create directory %s/%s (errno: %i); mounting read-only\n", ofs->config.workdir, name, -err); work = NULL; goto out_unlock; } static int ovl_check_namelen(const struct path *path, struct ovl_fs *ofs, const char *name) { struct kstatfs statfs; int err = vfs_statfs(path, &statfs); if (err) pr_err("statfs failed on '%s'\n", name); else ofs->namelen = max(ofs->namelen, statfs.f_namelen); return err; } static int ovl_lower_dir(const char *name, struct path *path, struct ovl_fs *ofs, int *stack_depth) { int fh_type; int err; err = ovl_check_namelen(path, ofs, name); if (err) return err; *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth); /* * The inodes index feature and NFS export need to encode and decode * file handles, so they require that all layers support them. */ fh_type = ovl_can_decode_fh(path->dentry->d_sb); if ((ofs->config.nfs_export || (ofs->config.index && ofs->config.upperdir)) && !fh_type) { ofs->config.index = false; ofs->config.nfs_export = false; pr_warn("fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n", name); } ofs->nofh |= !fh_type; /* * Decoding origin file handle is required for persistent st_ino. * Without persistent st_ino, xino=auto falls back to xino=off. */ if (ofs->config.xino == OVL_XINO_AUTO && ofs->config.upperdir && !fh_type) { ofs->config.xino = OVL_XINO_OFF; pr_warn("fs on '%s' does not support file handles, falling back to xino=off.\n", name); } /* Check if lower fs has 32bit inode numbers */ if (fh_type != FILEID_INO32_GEN) ofs->xino_mode = -1; return 0; } /* Workdir should not be subdir of upperdir and vice versa */ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) { bool ok = false; if (workdir != upperdir) { struct dentry *trap = lock_rename(workdir, upperdir); if (!IS_ERR(trap)) unlock_rename(workdir, upperdir); ok = (trap == NULL); } return ok; } static int ovl_setup_trap(struct super_block *sb, struct dentry *dir, struct inode **ptrap, const char *name) { struct inode *trap; int err; trap = ovl_get_trap_inode(sb, dir); err = PTR_ERR_OR_ZERO(trap); if (err) { if (err == -ELOOP) pr_err("conflicting %s path\n", name); return err; } *ptrap = trap; return 0; } /* * Determine how we treat concurrent use of upperdir/workdir based on the * index feature. This is papering over mount leaks of container runtimes, * for example, an old overlay mount is leaked and now its upperdir is * attempted to be used as a lower layer in a new overlay mount. 
*/ static int ovl_report_in_use(struct ovl_fs *ofs, const char *name) { if (ofs->config.index) { pr_err("%s is in-use as upperdir/workdir of another mount, mount with '-o index=off' to override exclusive upperdir protection.\n", name); return -EBUSY; } else { pr_warn("%s is in-use as upperdir/workdir of another mount, accessing files from both mounts will result in undefined behavior.\n", name); return 0; } } static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, struct ovl_layer *upper_layer, const struct path *upperpath) { struct vfsmount *upper_mnt; int err; /* Upperdir path should not be r/o */ if (__mnt_is_readonly(upperpath->mnt)) { pr_err("upper fs is r/o, try multi-lower layers mount\n"); err = -EINVAL; goto out; } err = ovl_check_namelen(upperpath, ofs, ofs->config.upperdir); if (err) goto out; err = ovl_setup_trap(sb, upperpath->dentry, &upper_layer->trap, "upperdir"); if (err) goto out; upper_mnt = clone_private_mount(upperpath); err = PTR_ERR(upper_mnt); if (IS_ERR(upper_mnt)) { pr_err("failed to clone upperpath\n"); goto out; } /* Don't inherit atime flags */ upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME); upper_layer->mnt = upper_mnt; upper_layer->idx = 0; upper_layer->fsid = 0; /* * Inherit SB_NOSEC flag from upperdir. * * This optimization changes behavior when a security related attribute * (suid/sgid/security.*) is changed on an underlying layer. This is * okay because we don't yet have guarantees in that case, but it will * need careful treatment once we want to honour changes to underlying * filesystems. */ if (upper_mnt->mnt_sb->s_flags & SB_NOSEC) sb->s_flags |= SB_NOSEC; if (ovl_inuse_trylock(ovl_upper_mnt(ofs)->mnt_root)) { ofs->upperdir_locked = true; } else { err = ovl_report_in_use(ofs, "upperdir"); if (err) goto out; } err = 0; out: return err; } /* * Returns 1 if RENAME_WHITEOUT is supported, 0 if not supported and * negative values if error is encountered. 
*/ static int ovl_check_rename_whiteout(struct ovl_fs *ofs) { struct dentry *workdir = ofs->workdir; struct inode *dir = d_inode(workdir); struct dentry *temp; struct dentry *dest; struct dentry *whiteout; struct name_snapshot name; int err; inode_lock_nested(dir, I_MUTEX_PARENT); temp = ovl_create_temp(ofs, workdir, OVL_CATTR(S_IFREG | 0)); err = PTR_ERR(temp); if (IS_ERR(temp)) goto out_unlock; dest = ovl_lookup_temp(ofs, workdir); err = PTR_ERR(dest); if (IS_ERR(dest)) { dput(temp); goto out_unlock; } /* Name is inline and stable - using snapshot as a copy helper */ take_dentry_name_snapshot(&name, temp); err = ovl_do_rename(ofs, dir, temp, dir, dest, RENAME_WHITEOUT); if (err) { if (err == -EINVAL) err = 0; goto cleanup_temp; } whiteout = ovl_lookup_upper(ofs, name.name.name, workdir, name.name.len); err = PTR_ERR(whiteout); if (IS_ERR(whiteout)) goto cleanup_temp; err = ovl_upper_is_whiteout(ofs, whiteout); /* Best effort cleanup of whiteout and temp file */ if (err) ovl_cleanup(ofs, dir, whiteout); dput(whiteout); cleanup_temp: ovl_cleanup(ofs, dir, temp); release_dentry_name_snapshot(&name); dput(temp); dput(dest); out_unlock: inode_unlock(dir); return err; } static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs, struct dentry *parent, const char *name, umode_t mode) { size_t len = strlen(name); struct dentry *child; inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); child = ovl_lookup_upper(ofs, name, parent, len); if (!IS_ERR(child) && !child->d_inode) child = ovl_create_real(ofs, parent->d_inode, child, OVL_CATTR(mode)); inode_unlock(parent->d_inode); dput(parent); return child; } /* * Creates $workdir/work/incompat/volatile/dirty file if it is not already * present. */ static int ovl_create_volatile_dirty(struct ovl_fs *ofs) { unsigned int ctr; struct dentry *d = dget(ofs->workbasedir); static const char *const volatile_path[] = { OVL_WORKDIR_NAME, "incompat", "volatile", "dirty" }; const char *const *name = volatile_path; for (ctr = ARRAY_SIZE(volatile_path); ctr; ctr--, name++) { d = ovl_lookup_or_create(ofs, d, *name, ctr > 1 ? S_IFDIR : S_IFREG); if (IS_ERR(d)) return PTR_ERR(d); } dput(d); return 0; } static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, const struct path *workpath) { struct vfsmount *mnt = ovl_upper_mnt(ofs); struct dentry *workdir; struct file *tmpfile; bool rename_whiteout; bool d_type; int fh_type; int err; err = mnt_want_write(mnt); if (err) return err; workdir = ovl_workdir_create(ofs, OVL_WORKDIR_NAME, false); err = PTR_ERR(workdir); if (IS_ERR_OR_NULL(workdir)) goto out; ofs->workdir = workdir; err = ovl_setup_trap(sb, ofs->workdir, &ofs->workdir_trap, "workdir"); if (err) goto out; /* * Upper should support d_type, else whiteouts are visible. Given * workdir and upper are on same fs, we can do iterate_dir() on * workdir. This check requires successful creation of workdir in * previous step. 
*/ err = ovl_check_d_type_supported(workpath); if (err < 0) goto out; d_type = err; if (!d_type) pr_warn("upper fs needs to support d_type.\n"); /* Check if upper/work fs supports O_TMPFILE */ tmpfile = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0); ofs->tmpfile = !IS_ERR(tmpfile); if (ofs->tmpfile) fput(tmpfile); else pr_warn("upper fs does not support tmpfile.\n"); /* Check if upper/work fs supports RENAME_WHITEOUT */ err = ovl_check_rename_whiteout(ofs); if (err < 0) goto out; rename_whiteout = err; if (!rename_whiteout) pr_warn("upper fs does not support RENAME_WHITEOUT.\n"); /* * Check if upper/work fs supports (trusted|user).overlay.* xattr */ err = ovl_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1); if (err) { pr_warn("failed to set xattr on upper\n"); ofs->noxattr = true; if (ovl_redirect_follow(ofs)) { ofs->config.redirect_mode = OVL_REDIRECT_NOFOLLOW; pr_warn("...falling back to redirect_dir=nofollow.\n"); } if (ofs->config.metacopy) { ofs->config.metacopy = false; pr_warn("...falling back to metacopy=off.\n"); } if (ofs->config.index) { ofs->config.index = false; pr_warn("...falling back to index=off.\n"); } if (ovl_has_fsid(ofs)) { ofs->config.uuid = OVL_UUID_NULL; pr_warn("...falling back to uuid=null.\n"); } /* * xattr support is required for persistent st_ino. * Without persistent st_ino, xino=auto falls back to xino=off. */ if (ofs->config.xino == OVL_XINO_AUTO) { ofs->config.xino = OVL_XINO_OFF; pr_warn("...falling back to xino=off.\n"); } if (err == -EPERM && !ofs->config.userxattr) pr_info("try mounting with 'userxattr' option\n"); err = 0; } else { ovl_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE); } /* * We allowed sub-optimal upper fs configuration and don't want to break * users over kernel upgrade, but we never allowed remote upper fs, so * we can enforce strict requirements for remote upper fs. */ if (ovl_dentry_remote(ofs->workdir) && (!d_type || !rename_whiteout || ofs->noxattr)) { pr_err("upper fs missing required features.\n"); err = -EINVAL; goto out; } /* * For volatile mount, create a incompat/volatile/dirty file to keep * track of it. 
*/ if (ofs->config.ovl_volatile) { err = ovl_create_volatile_dirty(ofs); if (err < 0) { pr_err("Failed to create volatile/dirty file.\n"); goto out; } } /* Check if upper/work fs supports file handles */ fh_type = ovl_can_decode_fh(ofs->workdir->d_sb); if (ofs->config.index && !fh_type) { ofs->config.index = false; pr_warn("upper fs does not support file handles, falling back to index=off.\n"); } ofs->nofh |= !fh_type; /* Check if upper fs has 32bit inode numbers */ if (fh_type != FILEID_INO32_GEN) ofs->xino_mode = -1; /* NFS export of r/w mount depends on index */ if (ofs->config.nfs_export && !ofs->config.index) { pr_warn("NFS export requires \"index=on\", falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } out: mnt_drop_write(mnt); return err; } static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs, const struct path *upperpath, const struct path *workpath) { int err; err = -EINVAL; if (upperpath->mnt != workpath->mnt) { pr_err("workdir and upperdir must reside under the same mount\n"); return err; } if (!ovl_workdir_ok(workpath->dentry, upperpath->dentry)) { pr_err("workdir and upperdir must be separate subtrees\n"); return err; } ofs->workbasedir = dget(workpath->dentry); if (ovl_inuse_trylock(ofs->workbasedir)) { ofs->workdir_locked = true; } else { err = ovl_report_in_use(ofs, "workdir"); if (err) return err; } err = ovl_setup_trap(sb, ofs->workbasedir, &ofs->workbasedir_trap, "workdir"); if (err) return err; return ovl_make_workdir(sb, ofs, workpath); } static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, struct ovl_entry *oe, const struct path *upperpath) { struct vfsmount *mnt = ovl_upper_mnt(ofs); struct dentry *indexdir; struct dentry *origin = ovl_lowerstack(oe)->dentry; const struct ovl_fh *fh; int err; fh = ovl_get_origin_fh(ofs, origin); if (IS_ERR(fh)) return PTR_ERR(fh); err = mnt_want_write(mnt); if (err) goto out_free_fh; /* Verify lower root is upper root origin */ err = ovl_verify_origin_fh(ofs, upperpath->dentry, fh, true); if (err) { pr_err("failed to verify upper root origin\n"); goto out; } /* index dir will act also as workdir */ iput(ofs->workdir_trap); ofs->workdir_trap = NULL; dput(ofs->workdir); ofs->workdir = NULL; indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true); if (IS_ERR(indexdir)) { err = PTR_ERR(indexdir); } else if (indexdir) { ofs->workdir = indexdir; err = ovl_setup_trap(sb, indexdir, &ofs->workdir_trap, "indexdir"); if (err) goto out; /* * Verify upper root is exclusively associated with index dir. * Older kernels stored upper fh in ".overlay.origin" * xattr. If that xattr exists, verify that it is a match to * upper dir file handle. In any case, verify or set xattr * ".overlay.upper" to indicate that index may have * directory entries. 
*/ if (ovl_check_origin_xattr(ofs, indexdir)) { err = ovl_verify_origin_xattr(ofs, indexdir, OVL_XATTR_ORIGIN, upperpath->dentry, true, false); if (err) pr_err("failed to verify index dir 'origin' xattr\n"); } err = ovl_verify_upper(ofs, indexdir, upperpath->dentry, true); if (err) pr_err("failed to verify index dir 'upper' xattr\n"); /* Cleanup bad/stale/orphan index entries */ if (!err) err = ovl_indexdir_cleanup(ofs); } if (err || !indexdir) pr_warn("try deleting index dir or mounting with '-o index=off' to disable inodes index.\n"); out: mnt_drop_write(mnt); out_free_fh: kfree(fh); return err; } static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) { unsigned int i; if (!ofs->config.nfs_export && !ovl_upper_mnt(ofs)) return true; /* * We allow using single lower with null uuid for index and nfs_export * for example to support those features with single lower squashfs. * To avoid regressions in setups of overlay with re-formatted lower * squashfs, do not allow decoding origin with lower null uuid unless * user opted-in to one of the new features that require following the * lower inode of non-dir upper. */ if (ovl_allow_offline_changes(ofs) && uuid_is_null(uuid)) return false; for (i = 0; i < ofs->numfs; i++) { /* * We use uuid to associate an overlay lower file handle with a * lower layer, so we can accept lower fs with null uuid as long * as all lower layers with null uuid are on the same fs. * if we detect multiple lower fs with the same uuid, we * disable lower file handle decoding on all of them. */ if (ofs->fs[i].is_lower && uuid_equal(&ofs->fs[i].sb->s_uuid, uuid)) { ofs->fs[i].bad_uuid = true; return false; } } return true; } /* Get a unique fsid for the layer */ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) { struct super_block *sb = path->mnt->mnt_sb; unsigned int i; dev_t dev; int err; bool bad_uuid = false; bool warn = false; for (i = 0; i < ofs->numfs; i++) { if (ofs->fs[i].sb == sb) return i; } if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) { bad_uuid = true; if (ofs->config.xino == OVL_XINO_AUTO) { ofs->config.xino = OVL_XINO_OFF; warn = true; } if (ofs->config.index || ofs->config.nfs_export) { ofs->config.index = false; ofs->config.nfs_export = false; warn = true; } if (warn) { pr_warn("%s uuid detected in lower fs '%pd2', falling back to xino=%s,index=off,nfs_export=off.\n", uuid_is_null(&sb->s_uuid) ? "null" : "conflicting", path->dentry, ovl_xino_mode(&ofs->config)); } } err = get_anon_bdev(&dev); if (err) { pr_err("failed to get anonymous bdev for lowerpath\n"); return err; } ofs->fs[ofs->numfs].sb = sb; ofs->fs[ofs->numfs].pseudo_dev = dev; ofs->fs[ofs->numfs].bad_uuid = bad_uuid; return ofs->numfs++; } /* * The fsid after the last lower fsid is used for the data layers. * It is a "null fs" with a null sb, null uuid, and no pseudo dev. */ static int ovl_get_data_fsid(struct ovl_fs *ofs) { return ofs->numfs; } static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, struct ovl_fs_context *ctx, struct ovl_layer *layers) { int err; unsigned int i; size_t nr_merged_lower; ofs->fs = kcalloc(ctx->nr + 2, sizeof(struct ovl_sb), GFP_KERNEL); if (ofs->fs == NULL) return -ENOMEM; /* * idx/fsid 0 are reserved for upper fs even with lower only overlay * and the last fsid is reserved for "null fs" of the data layers. */ ofs->numfs++; /* * All lower layers that share the same fs as upper layer, use the same * pseudo_dev as upper layer. Allocate fs[0].pseudo_dev even for lower * only overlay to simplify ovl_fs_free(). 
* is_lower will be set if upper fs is shared with a lower layer. */ err = get_anon_bdev(&ofs->fs[0].pseudo_dev); if (err) { pr_err("failed to get anonymous bdev for upper fs\n"); return err; } if (ovl_upper_mnt(ofs)) { ofs->fs[0].sb = ovl_upper_mnt(ofs)->mnt_sb; ofs->fs[0].is_lower = false; } nr_merged_lower = ctx->nr - ctx->nr_data; for (i = 0; i < ctx->nr; i++) { struct ovl_fs_context_layer *l = &ctx->lower[i]; struct vfsmount *mnt; struct inode *trap; int fsid; if (i < nr_merged_lower) fsid = ovl_get_fsid(ofs, &l->path); else fsid = ovl_get_data_fsid(ofs); if (fsid < 0) return fsid; /* * Check if lower root conflicts with this overlay layers before * checking if it is in-use as upperdir/workdir of "another" * mount, because we do not bother to check in ovl_is_inuse() if * the upperdir/workdir is in fact in-use by our * upperdir/workdir. */ err = ovl_setup_trap(sb, l->path.dentry, &trap, "lowerdir"); if (err) return err; if (ovl_is_inuse(l->path.dentry)) { err = ovl_report_in_use(ofs, "lowerdir"); if (err) { iput(trap); return err; } } mnt = clone_private_mount(&l->path); err = PTR_ERR(mnt); if (IS_ERR(mnt)) { pr_err("failed to clone lowerpath\n"); iput(trap); return err; } /* * Make lower layers R/O. That way fchmod/fchown on lower file * will fail instead of modifying lower fs. */ mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME; layers[ofs->numlayer].trap = trap; layers[ofs->numlayer].mnt = mnt; layers[ofs->numlayer].idx = ofs->numlayer; layers[ofs->numlayer].fsid = fsid; layers[ofs->numlayer].fs = &ofs->fs[fsid]; /* Store for printing lowerdir=... in ovl_show_options() */ ofs->config.lowerdirs[ofs->numlayer] = l->name; l->name = NULL; ofs->numlayer++; ofs->fs[fsid].is_lower = true; } /* * When all layers on same fs, overlay can use real inode numbers. * With mount option "xino=<on|auto>", mounter declares that there are * enough free high bits in underlying fs to hold the unique fsid. * If overlayfs does encounter underlying inodes using the high xino * bits reserved for fsid, it emits a warning and uses the original * inode number or a non persistent inode number allocated from a * dedicated range. */ if (ofs->numfs - !ovl_upper_mnt(ofs) == 1) { if (ofs->config.xino == OVL_XINO_ON) pr_info("\"xino=on\" is useless with all layers on same fs, ignore.\n"); ofs->xino_mode = 0; } else if (ofs->config.xino == OVL_XINO_OFF) { ofs->xino_mode = -1; } else if (ofs->xino_mode < 0) { /* * This is a roundup of number of bits needed for encoding * fsid, where fsid 0 is reserved for upper fs (even with * lower only overlay) +1 extra bit is reserved for the non * persistent inode number range that is used for resolving * xino lower bits overflow. 
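* For example (illustrative numbers): with an upper fs and three distinct
* lower fs, numfs is 4, so the calculation below reserves
* ilog2(4 - 1) + 2 = 3 high bits: two bits are enough for fsid values 0-3
* and the extra bit covers the non persistent inode number range.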
*/ BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 30); ofs->xino_mode = ilog2(ofs->numfs - 1) + 2; } if (ofs->xino_mode > 0) { pr_info("\"xino\" feature enabled using %d upper inode bits.\n", ofs->xino_mode); } return 0; } static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, struct ovl_fs_context *ctx, struct ovl_fs *ofs, struct ovl_layer *layers) { int err; unsigned int i; size_t nr_merged_lower; struct ovl_entry *oe; struct ovl_path *lowerstack; struct ovl_fs_context_layer *l; if (!ofs->config.upperdir && ctx->nr == 1) { pr_err("at least 2 lowerdir are needed while upperdir nonexistent\n"); return ERR_PTR(-EINVAL); } err = -EINVAL; for (i = 0; i < ctx->nr; i++) { l = &ctx->lower[i]; err = ovl_lower_dir(l->name, &l->path, ofs, &sb->s_stack_depth); if (err) return ERR_PTR(err); } err = -EINVAL; sb->s_stack_depth++; if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { pr_err("maximum fs stacking depth exceeded\n"); return ERR_PTR(err); } err = ovl_get_layers(sb, ofs, ctx, layers); if (err) return ERR_PTR(err); err = -ENOMEM; /* Data-only layers are not merged in root directory */ nr_merged_lower = ctx->nr - ctx->nr_data; oe = ovl_alloc_entry(nr_merged_lower); if (!oe) return ERR_PTR(err); lowerstack = ovl_lowerstack(oe); for (i = 0; i < nr_merged_lower; i++) { l = &ctx->lower[i]; lowerstack[i].dentry = dget(l->path.dentry); lowerstack[i].layer = &ofs->layers[i + 1]; } ofs->numdatalayer = ctx->nr_data; return oe; } /* * Check if this layer root is a descendant of: * - another layer of this overlayfs instance * - upper/work dir of any overlayfs instance */ static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs, struct dentry *dentry, const char *name, bool is_lower) { struct dentry *next = dentry, *parent; int err = 0; if (!dentry) return 0; parent = dget_parent(next); /* Walk back ancestors to root (inclusive) looking for traps */ while (!err && parent != next) { if (is_lower && ovl_lookup_trap_inode(sb, parent)) { err = -ELOOP; pr_err("overlapping %s path\n", name); } else if (ovl_is_inuse(parent)) { err = ovl_report_in_use(ofs, name); } next = parent; parent = dget_parent(next); dput(next); } dput(parent); return err; } /* * Check if any of the layers or work dirs overlap. */ static int ovl_check_overlapping_layers(struct super_block *sb, struct ovl_fs *ofs) { int i, err; if (ovl_upper_mnt(ofs)) { err = ovl_check_layer(sb, ofs, ovl_upper_mnt(ofs)->mnt_root, "upperdir", false); if (err) return err; /* * Checking workbasedir avoids hitting ovl_is_inuse(parent) of * this instance and covers overlapping work and index dirs, * unless work or index dir have been moved since created inside * workbasedir. In that case, we already have their traps in * inode cache and we will catch that case on lookup. 
*/ err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir", false); if (err) return err; } for (i = 1; i < ofs->numlayer; i++) { err = ovl_check_layer(sb, ofs, ofs->layers[i].mnt->mnt_root, "lowerdir", true); if (err) return err; } return 0; } static struct dentry *ovl_get_root(struct super_block *sb, struct dentry *upperdentry, struct ovl_entry *oe) { struct dentry *root; struct ovl_fs *ofs = OVL_FS(sb); struct ovl_path *lowerpath = ovl_lowerstack(oe); unsigned long ino = d_inode(lowerpath->dentry)->i_ino; int fsid = lowerpath->layer->fsid; struct ovl_inode_params oip = { .upperdentry = upperdentry, .oe = oe, }; root = d_make_root(ovl_new_inode(sb, S_IFDIR, 0)); if (!root) return NULL; if (upperdentry) { /* Root inode uses upper st_ino/i_ino */ ino = d_inode(upperdentry)->i_ino; fsid = 0; ovl_dentry_set_upper_alias(root); if (ovl_is_impuredir(sb, upperdentry)) ovl_set_flag(OVL_IMPURE, d_inode(root)); } /* Look for xwhiteouts marker except in the lowermost layer */ for (int i = 0; i < ovl_numlower(oe) - 1; i++, lowerpath++) { struct path path = { .mnt = lowerpath->layer->mnt, .dentry = lowerpath->dentry, }; /* overlay.opaque=x means xwhiteouts directory */ if (ovl_get_opaquedir_val(ofs, &path) == 'x') { ovl_layer_set_xwhiteouts(ofs, lowerpath->layer); ovl_dentry_set_xwhiteouts(root); } } /* Root is always merge -> can have whiteouts */ ovl_set_flag(OVL_WHITEOUTS, d_inode(root)); ovl_dentry_set_flag(OVL_E_CONNECTED, root); ovl_set_upperdata(d_inode(root)); ovl_inode_init(d_inode(root), &oip, ino, fsid); ovl_dentry_init_flags(root, upperdentry, oe, DCACHE_OP_WEAK_REVALIDATE); /* root keeps a reference of upperdentry */ dget(upperdentry); return root; } int ovl_fill_super(struct super_block *sb, struct fs_context *fc) { struct ovl_fs *ofs = sb->s_fs_info; struct ovl_fs_context *ctx = fc->fs_private; struct dentry *root_dentry; struct ovl_entry *oe; struct ovl_layer *layers; struct cred *cred; int err; err = -EIO; if (WARN_ON(fc->user_ns != current_user_ns())) goto out_err; sb->s_d_op = &ovl_dentry_operations; err = -ENOMEM; ofs->creator_cred = cred = prepare_creds(); if (!cred) goto out_err; err = ovl_fs_params_verify(ctx, &ofs->config); if (err) goto out_err; err = -EINVAL; if (ctx->nr == 0) { if (!(fc->sb_flags & SB_SILENT)) pr_err("missing 'lowerdir'\n"); goto out_err; } err = -ENOMEM; layers = kcalloc(ctx->nr + 1, sizeof(struct ovl_layer), GFP_KERNEL); if (!layers) goto out_err; ofs->config.lowerdirs = kcalloc(ctx->nr + 1, sizeof(char *), GFP_KERNEL); if (!ofs->config.lowerdirs) { kfree(layers); goto out_err; } ofs->layers = layers; /* * Layer 0 is reserved for upper even if there's no upper. * config.lowerdirs[0] is used for storing the user provided colon * separated lowerdir string. 
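* For example (illustrative paths), a mount with lowerdir=/l1:/l2:/l3 keeps
* the colon separated string "/l1:/l2:/l3" here so that ovl_show_options()
* can show the lowerdir option again.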
*/ ofs->config.lowerdirs[0] = ctx->lowerdir_all; ctx->lowerdir_all = NULL; ofs->numlayer = 1; sb->s_stack_depth = 0; sb->s_maxbytes = MAX_LFS_FILESIZE; atomic_long_set(&ofs->last_ino, 1); /* Assume underlying fs uses 32bit inodes unless proven otherwise */ if (ofs->config.xino != OVL_XINO_OFF) { ofs->xino_mode = BITS_PER_LONG - 32; if (!ofs->xino_mode) { pr_warn("xino not supported on 32bit kernel, falling back to xino=off.\n"); ofs->config.xino = OVL_XINO_OFF; } } /* alloc/destroy_inode needed for setting up traps in inode cache */ sb->s_op = &ovl_super_operations; if (ofs->config.upperdir) { struct super_block *upper_sb; err = -EINVAL; if (!ofs->config.workdir) { pr_err("missing 'workdir'\n"); goto out_err; } err = ovl_get_upper(sb, ofs, &layers[0], &ctx->upper); if (err) goto out_err; upper_sb = ovl_upper_mnt(ofs)->mnt_sb; if (!ovl_should_sync(ofs)) { ofs->errseq = errseq_sample(&upper_sb->s_wb_err); if (errseq_check(&upper_sb->s_wb_err, ofs->errseq)) { err = -EIO; pr_err("Cannot mount volatile when upperdir has an unseen error. Sync upperdir fs to clear state.\n"); goto out_err; } } err = ovl_get_workdir(sb, ofs, &ctx->upper, &ctx->work); if (err) goto out_err; if (!ofs->workdir) sb->s_flags |= SB_RDONLY; sb->s_stack_depth = upper_sb->s_stack_depth; sb->s_time_gran = upper_sb->s_time_gran; } oe = ovl_get_lowerstack(sb, ctx, ofs, layers); err = PTR_ERR(oe); if (IS_ERR(oe)) goto out_err; /* If the upper fs is nonexistent, we mark overlayfs r/o too */ if (!ovl_upper_mnt(ofs)) sb->s_flags |= SB_RDONLY; if (!ovl_origin_uuid(ofs) && ofs->numfs > 1) { pr_warn("The uuid=off requires a single fs for lower and upper, falling back to uuid=null.\n"); ofs->config.uuid = OVL_UUID_NULL; } else if (ovl_has_fsid(ofs) && ovl_upper_mnt(ofs)) { /* Use per instance persistent uuid/fsid */ ovl_init_uuid_xattr(sb, ofs, &ctx->upper); } if (!ovl_force_readonly(ofs) && ofs->config.index) { err = ovl_get_indexdir(sb, ofs, oe, &ctx->upper); if (err) goto out_free_oe; /* Force r/o mount with no index dir */ if (!ofs->workdir) sb->s_flags |= SB_RDONLY; } err = ovl_check_overlapping_layers(sb, ofs); if (err) goto out_free_oe; /* Show index=off in /proc/mounts for forced r/o mount */ if (!ofs->workdir) { ofs->config.index = false; if (ovl_upper_mnt(ofs) && ofs->config.nfs_export) { pr_warn("NFS export requires an index dir, falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } } if (ofs->config.metacopy && ofs->config.nfs_export) { pr_warn("NFS export is not supported with metadata only copy up, falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } /* * Support encoding decodable file handles with nfs_export=on * and encoding non-decodable file handles with nfs_export=off * if all layers support file handles. */ if (ofs->config.nfs_export) sb->s_export_op = &ovl_export_operations; else if (!ofs->nofh) sb->s_export_op = &ovl_export_fid_operations; /* Never override disk quota limits or use reserved space */ cap_lower(cred->cap_effective, CAP_SYS_RESOURCE); sb->s_magic = OVERLAYFS_SUPER_MAGIC; sb->s_xattr = ovl_xattr_handlers(ofs); sb->s_fs_info = ofs; #ifdef CONFIG_FS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif sb->s_iflags |= SB_I_SKIP_SYNC; /* * Ensure that umask handling is done by the filesystems used * for the the upper layer instead of overlayfs as that would * lead to unexpected results. 
*/ sb->s_iflags |= SB_I_NOUMASK; sb->s_iflags |= SB_I_EVM_UNSUPPORTED; err = -ENOMEM; root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe); if (!root_dentry) goto out_free_oe; sb->s_root = root_dentry; return 0; out_free_oe: ovl_free_entry(oe); out_err: ovl_free_fs(ofs); sb->s_fs_info = NULL; return err; } struct file_system_type ovl_fs_type = { .owner = THIS_MODULE, .name = "overlay", .init_fs_context = ovl_init_fs_context, .parameters = ovl_parameter_spec, .fs_flags = FS_USERNS_MOUNT, .kill_sb = kill_anon_super, }; MODULE_ALIAS_FS("overlay"); static void ovl_inode_init_once(void *foo) { struct ovl_inode *oi = foo; inode_init_once(&oi->vfs_inode); } static int __init ovl_init(void) { int err; ovl_inode_cachep = kmem_cache_create("ovl_inode", sizeof(struct ovl_inode), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT), ovl_inode_init_once); if (ovl_inode_cachep == NULL) return -ENOMEM; err = register_filesystem(&ovl_fs_type); if (!err) return 0; kmem_cache_destroy(ovl_inode_cachep); return err; } static void __exit ovl_exit(void) { unregister_filesystem(&ovl_fs_type); /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(ovl_inode_cachep); } module_init(ovl_init); module_exit(ovl_exit);
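/*
 * Illustrative userspace sketch (not part of the kernel code above): how the
 * "overlay" filesystem type registered above is typically mounted with
 * mount(2). The paths are assumptions for the example; the option names
 * (lowerdir, upperdir, workdir) are the ones handled by ovl_fill_super(),
 * which requires 'workdir' whenever 'upperdir' is given and at least two
 * lower layers when there is no upper layer.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* upperdir and workdir must reside on the same filesystem (see ovl_get_workdir()). */
	const char *opts = "lowerdir=/lower,upperdir=/upper,workdir=/work";

	if (mount("overlay", "/merged", "overlay", 0, opts)) {
		perror("mount overlay");
		return 1;
	}
	return 0;
}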
/* SPDX-License-Identifier: GPL-2.0-only */ /* * kernfs.h - pseudo filesystem decoupled from vfs locking */ #ifndef __LINUX_KERNFS_H #define __LINUX_KERNFS_H #include <linux/err.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/idr.h> #include <linux/lockdep.h> #include <linux/rbtree.h> #include <linux/atomic.h> #include <linux/bug.h> #include <linux/types.h> #include <linux/uidgid.h> #include <linux/wait.h> #include <linux/rwsem.h> #include <linux/cache.h> struct file; struct dentry; struct iattr; struct seq_file; struct vm_area_struct; struct vm_operations_struct; struct super_block; struct file_system_type; struct poll_table_struct; struct fs_context; struct kernfs_fs_context; struct kernfs_open_node; struct kernfs_iattrs; /* * NR_KERNFS_LOCK_BITS determines size (NR_KERNFS_LOCKS) of hash * table of locks. * Having a small hash table would impact scalability, since * more and more kernfs_node objects will end up using same lock * and having a very large hash table would waste memory.
* * At the moment size of hash table of locks is being set based on * the number of CPUs as follows: * * NR_CPU NR_KERNFS_LOCK_BITS NR_KERNFS_LOCKS * 1 1 2 * 2-3 2 4 * 4-7 4 16 * 8-15 6 64 * 16-31 8 256 * 32 and more 10 1024 * * The above relation between NR_CPU and number of locks is based * on some internal experimentation which involved booting qemu * with different values of smp, performing some sysfs operations * on all CPUs and observing how increase in number of locks impacts * completion time of these sysfs operations on each CPU. */ #ifdef CONFIG_SMP #define NR_KERNFS_LOCK_BITS (2 * (ilog2(NR_CPUS < 32 ? NR_CPUS : 32))) #else #define NR_KERNFS_LOCK_BITS 1 #endif #define NR_KERNFS_LOCKS (1 << NR_KERNFS_LOCK_BITS) /* * There's one kernfs_open_file for each open file and one kernfs_open_node * for each kernfs_node with one or more open files. * * filp->private_data points to seq_file whose ->private points to * kernfs_open_file. * * kernfs_open_files are chained at kernfs_open_node->files, which is * protected by kernfs_global_locks.open_file_mutex[i]. * * To reduce possible contention in sysfs access, arising due to single * locks, use an array of locks (e.g. open_file_mutex) and use kernfs_node * object address as hash keys to get the index of these locks. * * Hashed mutexes are safe to use here because operations using these don't * rely on global exclusion. * * In future we intend to replace other global locks with hashed ones as well. * kernfs_global_locks acts as a holder for all such hash tables. */ struct kernfs_global_locks { struct mutex open_file_mutex[NR_KERNFS_LOCKS]; }; enum kernfs_node_type { KERNFS_DIR = 0x0001, KERNFS_FILE = 0x0002, KERNFS_LINK = 0x0004, }; #define KERNFS_TYPE_MASK 0x000f #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK #define KERNFS_MAX_USER_XATTRS 128 #define KERNFS_USER_XATTR_SIZE_LIMIT (128 << 10) enum kernfs_node_flag { KERNFS_ACTIVATED = 0x0010, KERNFS_NS = 0x0020, KERNFS_HAS_SEQ_SHOW = 0x0040, KERNFS_HAS_MMAP = 0x0080, KERNFS_LOCKDEP = 0x0100, KERNFS_HIDDEN = 0x0200, KERNFS_SUICIDAL = 0x0400, KERNFS_SUICIDED = 0x0800, KERNFS_EMPTY_DIR = 0x1000, KERNFS_HAS_RELEASE = 0x2000, KERNFS_REMOVING = 0x4000, }; /* @flags for kernfs_create_root() */ enum kernfs_root_flag { /* * kernfs_nodes are created in the deactivated state and invisible. * They require explicit kernfs_activate() to become visible. This * can be used to make related nodes become visible atomically * after all nodes are created successfully. */ KERNFS_ROOT_CREATE_DEACTIVATED = 0x0001, /* * For regular files, if the opener has CAP_DAC_OVERRIDE, open(2) * succeeds regardless of the RW permissions. sysfs had an extra * layer of enforcement where open(2) fails with -EACCES regardless * of CAP_DAC_OVERRIDE if the permission doesn't have the * respective read or write access at all (none of S_IRUGO or * S_IWUGO) or the respective operation isn't implemented. The * following flag enables that behavior. */ KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK = 0x0002, /* * The filesystem supports exportfs operation, so userspace can use * fhandle to access nodes of the fs. */ KERNFS_ROOT_SUPPORT_EXPORTOP = 0x0004, /* * Support user xattrs to be written to nodes rooted at this root. */ KERNFS_ROOT_SUPPORT_USER_XATTR = 0x0008, }; /* type-specific structures for kernfs_node union members */ struct kernfs_elem_dir { unsigned long subdirs; /* children rbtree starts here and goes through kn->rb */ struct rb_root children; /* * The kernfs hierarchy this directory belongs to. 
This fits * better directly in kernfs_node but is here to save space. */ struct kernfs_root *root; /* * Monotonic revision counter, used to identify if a directory * node has changed during negative dentry revalidation. */ unsigned long rev; }; struct kernfs_elem_symlink { struct kernfs_node *target_kn; }; struct kernfs_elem_attr { const struct kernfs_ops *ops; struct kernfs_open_node __rcu *open; loff_t size; struct kernfs_node *notify_next; /* for kernfs_notify() */ }; /* * kernfs_node - the building block of kernfs hierarchy. Each and every * kernfs node is represented by single kernfs_node. Most fields are * private to kernfs and shouldn't be accessed directly by kernfs users. * * As long as count reference is held, the kernfs_node itself is * accessible. Dereferencing elem or any other outer entity requires * active reference. */ struct kernfs_node { atomic_t count; atomic_t active; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif /* * Use kernfs_get_parent() and kernfs_name/path() instead of * accessing the following two fields directly. If the node is * never moved to a different parent, it is safe to access the * parent directly. */ struct kernfs_node *parent; const char *name; struct rb_node rb; const void *ns; /* namespace tag */ unsigned int hash; /* ns + name hash */ union { struct kernfs_elem_dir dir; struct kernfs_elem_symlink symlink; struct kernfs_elem_attr attr; }; void *priv; /* * 64bit unique ID. On 64bit ino setups, id is the ino. On 32bit, * the low 32bits are ino and upper generation. */ u64 id; unsigned short flags; umode_t mode; struct kernfs_iattrs *iattr; }; /* * kernfs_syscall_ops may be specified on kernfs_create_root() to support * syscalls. These optional callbacks are invoked on the matching syscalls * and can perform any kernfs operations which don't necessarily have to be * the exact operation requested. An active reference is held for each * kernfs_node parameter. */ struct kernfs_syscall_ops { int (*show_options)(struct seq_file *sf, struct kernfs_root *root); int (*mkdir)(struct kernfs_node *parent, const char *name, umode_t mode); int (*rmdir)(struct kernfs_node *kn); int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name); int (*show_path)(struct seq_file *sf, struct kernfs_node *kn, struct kernfs_root *root); }; struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root); struct kernfs_open_file { /* published fields */ struct kernfs_node *kn; struct file *file; struct seq_file *seq_file; void *priv; /* private fields, do not use outside kernfs proper */ struct mutex mutex; struct mutex prealloc_mutex; int event; struct list_head list; char *prealloc_buf; size_t atomic_write_len; bool mmapped:1; bool released:1; const struct vm_operations_struct *vm_ops; }; struct kernfs_ops { /* * Optional open/release methods. Both are called with * @of->seq_file populated. */ int (*open)(struct kernfs_open_file *of); void (*release)(struct kernfs_open_file *of); /* * Read is handled by either seq_file or raw_read(). * * If seq_show() is present, seq_file path is active. Other seq * operations are optional and if not implemented, the behavior is * equivalent to single_open(). @sf->private points to the * associated kernfs_open_file. * * read() is bounced through kernel buffer and a read larger than * PAGE_SIZE results in partial operation of PAGE_SIZE. 
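* For example (illustrative), a single read(2) of 3 * PAGE_SIZE bytes from a
* raw ->read() file returns at most PAGE_SIZE bytes; the caller has to issue
* further read(2) calls to fetch the rest.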
*/ int (*seq_show)(struct seq_file *sf, void *v); void *(*seq_start)(struct seq_file *sf, loff_t *ppos); void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); void (*seq_stop)(struct seq_file *sf, void *v); ssize_t (*read)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); /* * write() is bounced through kernel buffer. If atomic_write_len * is not set, a write larger than PAGE_SIZE results in partial * operations of PAGE_SIZE chunks. If atomic_write_len is set, * writes upto the specified size are executed atomically but * larger ones are rejected with -E2BIG. */ size_t atomic_write_len; /* * "prealloc" causes a buffer to be allocated at open for * all read/write requests. As ->seq_show uses seq_read() * which does its own allocation, it is incompatible with * ->prealloc. Provide ->read and ->write with ->prealloc. */ bool prealloc; ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); __poll_t (*poll)(struct kernfs_open_file *of, struct poll_table_struct *pt); int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma); loff_t (*llseek)(struct kernfs_open_file *of, loff_t offset, int whence); }; /* * The kernfs superblock creation/mount parameter context. */ struct kernfs_fs_context { struct kernfs_root *root; /* Root of the hierarchy being mounted */ void *ns_tag; /* Namespace tag of the mount (or NULL) */ unsigned long magic; /* File system specific magic number */ /* The following are set/used by kernfs_mount() */ bool new_sb_created; /* Set to T if we allocated a new sb */ }; #ifdef CONFIG_KERNFS static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) { return kn->flags & KERNFS_TYPE_MASK; } static inline ino_t kernfs_id_ino(u64 id) { /* id is ino if ino_t is 64bit; otherwise, low 32bits */ if (sizeof(ino_t) >= sizeof(u64)) return id; else return (u32)id; } static inline u32 kernfs_id_gen(u64 id) { /* gen is fixed at 1 if ino_t is 64bit; otherwise, high 32bits */ if (sizeof(ino_t) >= sizeof(u64)) return 1; else return id >> 32; } static inline ino_t kernfs_ino(struct kernfs_node *kn) { return kernfs_id_ino(kn->id); } static inline ino_t kernfs_gen(struct kernfs_node *kn) { return kernfs_id_gen(kn->id); } /** * kernfs_enable_ns - enable namespace under a directory * @kn: directory of interest, should be empty * * This is to be called right after @kn is created to enable namespace * under it. All children of @kn must have non-NULL namespace tags and * only the ones which match the super_block's tag will be visible. */ static inline void kernfs_enable_ns(struct kernfs_node *kn) { WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children)); kn->flags |= KERNFS_NS; } /** * kernfs_ns_enabled - test whether namespace is enabled * @kn: the node to test * * Test whether namespace filtering is enabled for the children of @ns. 
*/ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) { return kn->flags & KERNFS_NS; } int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, char *buf, size_t buflen); void pr_cont_kernfs_name(struct kernfs_node *kn); void pr_cont_kernfs_path(struct kernfs_node *kn); struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn); struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns); struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns); void kernfs_get(struct kernfs_node *kn); void kernfs_put(struct kernfs_node *kn); struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry); struct kernfs_root *kernfs_root_from_sb(struct super_block *sb); struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); struct dentry *kernfs_node_dentry(struct kernfs_node *kn, struct super_block *sb); struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv); void kernfs_destroy_root(struct kernfs_root *root); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const void *ns); struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name); struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key); struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target); void kernfs_activate(struct kernfs_node *kn); void kernfs_show(struct kernfs_node *kn, bool show); void kernfs_remove(struct kernfs_node *kn); void kernfs_break_active_protection(struct kernfs_node *kn); void kernfs_unbreak_active_protection(struct kernfs_node *kn); bool kernfs_remove_self(struct kernfs_node *kn); int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns); int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns); int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); __poll_t kernfs_generic_poll(struct kernfs_open_file *of, struct poll_table_struct *pt); void kernfs_notify(struct kernfs_node *kn); int kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size); int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags); const void *kernfs_super_ns(struct super_block *sb); int kernfs_get_tree(struct fs_context *fc); void kernfs_free_fs_context(struct fs_context *fc); void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, u64 id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) { return 0; } /* whatever */ static inline void kernfs_enable_ns(struct kernfs_node *kn) { } static inline bool kernfs_ns_enabled(struct kernfs_node *kn) { return false; } static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } static inline int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } static inline void pr_cont_kernfs_name(struct 
kernfs_node *kn) { } static inline void pr_cont_kernfs_path(struct kernfs_node *kn) { } static inline struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { return NULL; } static inline struct kernfs_node * kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { return NULL; } static inline struct kernfs_node * kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns) { return NULL; } static inline void kernfs_get(struct kernfs_node *kn) { } static inline void kernfs_put(struct kernfs_node *kn) { } static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { return NULL; } static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) { return NULL; } static inline struct inode * kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { return NULL; } static inline struct kernfs_root * kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) { return ERR_PTR(-ENOSYS); } static inline void kernfs_destroy_root(struct kernfs_root *root) { } static inline struct kernfs_node * kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const void *ns) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * __kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target) { return ERR_PTR(-ENOSYS); } static inline void kernfs_activate(struct kernfs_node *kn) { } static inline void kernfs_remove(struct kernfs_node *kn) { } static inline bool kernfs_remove_self(struct kernfs_node *kn) { return false; } static inline int kernfs_remove_by_name_ns(struct kernfs_node *kn, const char *name, const void *ns) { return -ENOSYS; } static inline int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { return -ENOSYS; } static inline int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { return -ENOSYS; } static inline __poll_t kernfs_generic_poll(struct kernfs_open_file *of, struct poll_table_struct *pt) { return -ENOSYS; } static inline void kernfs_notify(struct kernfs_node *kn) { } static inline int kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size) { return -ENOSYS; } static inline int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { return -ENOSYS; } static inline const void *kernfs_super_ns(struct super_block *sb) { return NULL; } static inline int kernfs_get_tree(struct fs_context *fc) { return -ENOSYS; } static inline void kernfs_free_fs_context(struct fs_context *fc) { } static inline void kernfs_kill_sb(struct super_block *sb) { } static inline void kernfs_init(void) { } #endif /* CONFIG_KERNFS */ /** * kernfs_path - build full path of a given node * @kn: kernfs_node of interest * @buf: buffer to copy @kn's name into * @buflen: size of @buf * * If @kn is NULL result will be "(null)". * * Returns the length of the full path. If the full length is equal to or * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. 
*/ static inline int kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) { return kernfs_path_from_node(kn, NULL, buf, buflen); } static inline struct kernfs_node * kernfs_find_and_get(struct kernfs_node *kn, const char *name) { return kernfs_find_and_get_ns(kn, name, NULL); } static inline struct kernfs_node * kernfs_walk_and_get(struct kernfs_node *kn, const char *path) { return kernfs_walk_and_get_ns(kn, path, NULL); } static inline struct kernfs_node * kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode, void *priv) { return kernfs_create_dir_ns(parent, name, mode, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, priv, NULL); } static inline int kernfs_remove_by_name(struct kernfs_node *parent, const char *name) { return kernfs_remove_by_name_ns(parent, name, NULL); } static inline int kernfs_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name) { return kernfs_rename_ns(kn, new_parent, new_name, NULL); } #endif /* __LINUX_KERNFS_H */
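/*
 * Illustrative sketch only (not part of kernfs.h): one way the API above can
 * be wired together -- create a root, a directory and one seq_file-backed
 * file, then activate the tree.  All "example_*" names are hypothetical and
 * error handling is kept to the minimum.
 */
#include <linux/kernfs.h>
#include <linux/seq_file.h>
#include <linux/err.h>

static int example_seq_show(struct seq_file *sf, void *v)
{
	seq_puts(sf, "hello from kernfs\n");
	return 0;
}

static const struct kernfs_ops example_file_ops = {
	.seq_show	= example_seq_show,
};

static int example_build_tree(struct kernfs_root **rootp)
{
	struct kernfs_root *root;
	struct kernfs_node *dir, *file;

	/* Start deactivated so the hierarchy becomes visible atomically. */
	root = kernfs_create_root(NULL, KERNFS_ROOT_CREATE_DEACTIVATED, NULL);
	if (IS_ERR(root))
		return PTR_ERR(root);

	dir = kernfs_create_dir(kernfs_root_to_node(root), "example", 0755, NULL);
	if (IS_ERR(dir)) {
		kernfs_destroy_root(root);
		return PTR_ERR(dir);
	}

	file = __kernfs_create_file(dir, "greeting", 0444, GLOBAL_ROOT_UID,
				    GLOBAL_ROOT_GID, 0, &example_file_ops,
				    NULL, NULL, NULL);
	if (IS_ERR(file)) {
		kernfs_destroy_root(root);
		return PTR_ERR(file);
	}

	kernfs_activate(kernfs_root_to_node(root));
	*rootp = root;
	return 0;
}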
// SPDX-License-Identifier: GPL-2.0 /* * USB device quirk handling logic and table * * Copyright (c) 2007 Oliver Neukum * Copyright (c) 2007 Greg Kroah-Hartman <gregkh@suse.de> */ #include <linux/moduleparam.h> #include <linux/usb.h> #include <linux/usb/quirks.h> #include <linux/usb/hcd.h> #include "usb.h" struct quirk_entry { u16 vid; u16 pid; u32 flags; }; static DEFINE_MUTEX(quirk_mutex); static struct quirk_entry *quirk_list; static unsigned int quirk_count; static char quirks_param[128]; static int quirks_param_set(const char *value, const struct kernel_param *kp) { char *val, *p, *field; u16 vid, pid; u32 flags;
size_t i; int err; val = kstrdup(value, GFP_KERNEL); if (!val) return -ENOMEM; err = param_set_copystring(val, kp); if (err) { kfree(val); return err; } mutex_lock(&quirk_mutex); if (!*val) { quirk_count = 0; kfree(quirk_list); quirk_list = NULL; goto unlock; } for (quirk_count = 1, i = 0; val[i]; i++) if (val[i] == ',') quirk_count++; if (quirk_list) { kfree(quirk_list); quirk_list = NULL; } quirk_list = kcalloc(quirk_count, sizeof(struct quirk_entry), GFP_KERNEL); if (!quirk_list) { quirk_count = 0; mutex_unlock(&quirk_mutex); kfree(val); return -ENOMEM; } for (i = 0, p = val; p && *p;) { /* Each entry consists of VID:PID:flags */ field = strsep(&p, ":"); if (!field) break; if (kstrtou16(field, 16, &vid)) break; field = strsep(&p, ":"); if (!field) break; if (kstrtou16(field, 16, &pid)) break; field = strsep(&p, ","); if (!field || !*field) break; /* Collect the flags */ for (flags = 0; *field; field++) { switch (*field) { case 'a': flags |= USB_QUIRK_STRING_FETCH_255; break; case 'b': flags |= USB_QUIRK_RESET_RESUME; break; case 'c': flags |= USB_QUIRK_NO_SET_INTF; break; case 'd': flags |= USB_QUIRK_CONFIG_INTF_STRINGS; break; case 'e': flags |= USB_QUIRK_RESET; break; case 'f': flags |= USB_QUIRK_HONOR_BNUMINTERFACES; break; case 'g': flags |= USB_QUIRK_DELAY_INIT; break; case 'h': flags |= USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL; break; case 'i': flags |= USB_QUIRK_DEVICE_QUALIFIER; break; case 'j': flags |= USB_QUIRK_IGNORE_REMOTE_WAKEUP; break; case 'k': flags |= USB_QUIRK_NO_LPM; break; case 'l': flags |= USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL; break; case 'm': flags |= USB_QUIRK_DISCONNECT_SUSPEND; break; case 'n': flags |= USB_QUIRK_DELAY_CTRL_MSG; break; case 'o': flags |= USB_QUIRK_HUB_SLOW_RESET; break; case 'p': flags |= USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT; break; /* Ignore unrecognized flag characters */ } } quirk_list[i++] = (struct quirk_entry) { .vid = vid, .pid = pid, .flags = flags }; } if (i < quirk_count) quirk_count = i; unlock: mutex_unlock(&quirk_mutex); kfree(val); return 0; } static const struct kernel_param_ops quirks_param_ops = { .set = quirks_param_set, .get = param_get_string, }; static struct kparam_string quirks_param_string = { .maxlen = sizeof(quirks_param), .string = quirks_param, }; device_param_cb(quirks, &quirks_param_ops, &quirks_param_string, 0644); MODULE_PARM_DESC(quirks, "Add/modify USB quirks by specifying quirks=vendorID:productID:quirks"); /* Lists of quirky USB devices, split in device quirks and interface quirks. * Device quirks are applied at the very beginning of the enumeration process, * right after reading the device descriptor. They can thus only match on device * information. * * Interface quirks are applied after reading all the configuration descriptors. * They can match on both device and interface information. * * Note that the DELAY_INIT and HONOR_BNUMINTERFACES quirks do not make sense as * interface quirks, as they only influence the enumeration process which is run * before processing the interface quirks. 
* * Please keep the lists ordered by: * 1) Vendor ID * 2) Product ID * 3) Class ID */ static const struct usb_device_id usb_quirk_list[] = { /* CBM - Flash disk */ { USB_DEVICE(0x0204, 0x6025), .driver_info = USB_QUIRK_RESET_RESUME }, /* WORLDE Controller KS49 or Prodipe MIDI 49C USB controller */ { USB_DEVICE(0x0218, 0x0201), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* WORLDE easy key (easykey.25) MIDI controller */ { USB_DEVICE(0x0218, 0x0401), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* HP 5300/5370C scanner */ { USB_DEVICE(0x03f0, 0x0701), .driver_info = USB_QUIRK_STRING_FETCH_255 }, /* HP v222w 16GB Mini USB Drive */ { USB_DEVICE(0x03f0, 0x3f40), .driver_info = USB_QUIRK_DELAY_INIT }, /* Creative SB Audigy 2 NX */ { USB_DEVICE(0x041e, 0x3020), .driver_info = USB_QUIRK_RESET_RESUME }, /* USB3503 */ { USB_DEVICE(0x0424, 0x3503), .driver_info = USB_QUIRK_RESET_RESUME }, /* Microsoft Wireless Laser Mouse 6000 Receiver */ { USB_DEVICE(0x045e, 0x00e1), .driver_info = USB_QUIRK_RESET_RESUME }, /* Microsoft LifeCam-VX700 v2.0 */ { USB_DEVICE(0x045e, 0x0770), .driver_info = USB_QUIRK_RESET_RESUME }, /* Microsoft Surface Dock Ethernet (RTL8153 GigE) */ { USB_DEVICE(0x045e, 0x07c6), .driver_info = USB_QUIRK_NO_LPM }, /* Cherry Stream G230 2.0 (G85-231) and 3.0 (G85-232) */ { USB_DEVICE(0x046a, 0x0023), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech HD Webcam C270 */ { USB_DEVICE(0x046d, 0x0825), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech HD Pro Webcams C920, C920-C, C922, C925e and C930e */ { USB_DEVICE(0x046d, 0x082d), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x0841), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x0843), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x085b), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x085c), .driver_info = USB_QUIRK_DELAY_INIT }, /* Logitech ConferenceCam CC3000e */ { USB_DEVICE(0x046d, 0x0847), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x0848), .driver_info = USB_QUIRK_DELAY_INIT }, /* Logitech PTZ Pro Camera */ { USB_DEVICE(0x046d, 0x0853), .driver_info = USB_QUIRK_DELAY_INIT }, /* Logitech Screen Share */ { USB_DEVICE(0x046d, 0x086c), .driver_info = USB_QUIRK_NO_LPM }, /* Logitech Quickcam Fusion */ { USB_DEVICE(0x046d, 0x08c1), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam Orbit MP */ { USB_DEVICE(0x046d, 0x08c2), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam Pro for Notebook */ { USB_DEVICE(0x046d, 0x08c3), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam Pro 5000 */ { USB_DEVICE(0x046d, 0x08c5), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam OEM Dell Notebook */ { USB_DEVICE(0x046d, 0x08c6), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam OEM Cisco VT Camera II */ { USB_DEVICE(0x046d, 0x08c7), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Harmony 700-series */ { USB_DEVICE(0x046d, 0xc122), .driver_info = USB_QUIRK_DELAY_INIT }, /* Philips PSC805 audio device */ { USB_DEVICE(0x0471, 0x0155), .driver_info = USB_QUIRK_RESET_RESUME }, /* Plantronic Audio 655 DSP */ { USB_DEVICE(0x047f, 0xc008), .driver_info = USB_QUIRK_RESET_RESUME }, /* Plantronic Audio 648 USB */ { USB_DEVICE(0x047f, 0xc013), .driver_info = USB_QUIRK_RESET_RESUME }, /* Artisman Watchdog Dongle */ { USB_DEVICE(0x04b4, 0x0526), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Microchip Joss Optical infrared touchboard device */ { USB_DEVICE(0x04d8, 0x000c), .driver_info = 
USB_QUIRK_CONFIG_INTF_STRINGS }, /* CarrolTouch 4000U */ { USB_DEVICE(0x04e7, 0x0009), .driver_info = USB_QUIRK_RESET_RESUME }, /* CarrolTouch 4500U */ { USB_DEVICE(0x04e7, 0x0030), .driver_info = USB_QUIRK_RESET_RESUME }, /* Samsung Android phone modem - ID conflict with SPH-I500 */ { USB_DEVICE(0x04e8, 0x6601), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Elan Touchscreen */ { USB_DEVICE(0x04f3, 0x0089), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x009b), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x010c), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x0125), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x016f), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x0381), .driver_info = USB_QUIRK_NO_LPM }, { USB_DEVICE(0x04f3, 0x21b8), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, /* Roland SC-8820 */ { USB_DEVICE(0x0582, 0x0007), .driver_info = USB_QUIRK_RESET_RESUME }, /* Edirol SD-20 */ { USB_DEVICE(0x0582, 0x0027), .driver_info = USB_QUIRK_RESET_RESUME }, /* Alcor Micro Corp. Hub */ { USB_DEVICE(0x058f, 0x9254), .driver_info = USB_QUIRK_RESET_RESUME }, /* appletouch */ { USB_DEVICE(0x05ac, 0x021a), .driver_info = USB_QUIRK_RESET_RESUME }, /* Genesys Logic hub, internally used by KY-688 USB 3.1 Type-C Hub */ { USB_DEVICE(0x05e3, 0x0612), .driver_info = USB_QUIRK_NO_LPM }, /* ELSA MicroLink 56K */ { USB_DEVICE(0x05cc, 0x2267), .driver_info = USB_QUIRK_RESET_RESUME }, /* Genesys Logic hub, internally used by Moshi USB to Ethernet Adapter */ { USB_DEVICE(0x05e3, 0x0616), .driver_info = USB_QUIRK_NO_LPM }, /* Avision AV600U */ { USB_DEVICE(0x0638, 0x0a13), .driver_info = USB_QUIRK_STRING_FETCH_255 }, /* Saitek Cyborg Gold Joystick */ { USB_DEVICE(0x06a3, 0x0006), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Agfa SNAPSCAN 1212U */ { USB_DEVICE(0x06bd, 0x0001), .driver_info = USB_QUIRK_RESET_RESUME }, /* Guillemot Webcam Hercules Dualpix Exchange (2nd ID) */ { USB_DEVICE(0x06f8, 0x0804), .driver_info = USB_QUIRK_RESET_RESUME }, /* Guillemot Webcam Hercules Dualpix Exchange*/ { USB_DEVICE(0x06f8, 0x3005), .driver_info = USB_QUIRK_RESET_RESUME }, /* Guillemot Hercules DJ Console audio card (BZ 208357) */ { USB_DEVICE(0x06f8, 0xb000), .driver_info = USB_QUIRK_ENDPOINT_IGNORE }, /* Midiman M-Audio Keystation 88es */ { USB_DEVICE(0x0763, 0x0192), .driver_info = USB_QUIRK_RESET_RESUME }, /* SanDisk Ultra Fit and Ultra Flair */ { USB_DEVICE(0x0781, 0x5583), .driver_info = USB_QUIRK_NO_LPM }, { USB_DEVICE(0x0781, 0x5591), .driver_info = USB_QUIRK_NO_LPM }, /* Realforce 87U Keyboard */ { USB_DEVICE(0x0853, 0x011b), .driver_info = USB_QUIRK_NO_LPM }, /* M-Systems Flash Disk Pioneers */ { USB_DEVICE(0x08ec, 0x1000), .driver_info = USB_QUIRK_RESET_RESUME }, /* Baum Vario Ultra */ { USB_DEVICE(0x0904, 0x6101), .driver_info = USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL }, { USB_DEVICE(0x0904, 0x6102), .driver_info = USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL }, { USB_DEVICE(0x0904, 0x6103), .driver_info = USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL }, /* Sound Devices USBPre2 */ { USB_DEVICE(0x0926, 0x0202), .driver_info = USB_QUIRK_ENDPOINT_IGNORE }, /* Sound Devices MixPre-D */ { USB_DEVICE(0x0926, 0x0208), .driver_info = USB_QUIRK_ENDPOINT_IGNORE }, /* Keytouch QWERTY Panel keyboard */ { USB_DEVICE(0x0926, 0x3333), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Kingston DataTraveler 3.0 */ { USB_DEVICE(0x0951, 0x1666), .driver_info = USB_QUIRK_NO_LPM }, /* NVIDIA Jetson devices in Force Recovery mode */ 
{ USB_DEVICE(0x0955, 0x7018), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7019), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7418), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7721), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7c18), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7e19), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7f21), .driver_info = USB_QUIRK_RESET_RESUME }, /* X-Rite/Gretag-Macbeth Eye-One Pro display colorimeter */ { USB_DEVICE(0x0971, 0x2000), .driver_info = USB_QUIRK_NO_SET_INTF }, /* ELMO L-12F document camera */ { USB_DEVICE(0x09a1, 0x0028), .driver_info = USB_QUIRK_DELAY_CTRL_MSG }, /* Broadcom BCM92035DGROM BT dongle */ { USB_DEVICE(0x0a5c, 0x2021), .driver_info = USB_QUIRK_RESET_RESUME }, /* MAYA44USB sound device */ { USB_DEVICE(0x0a92, 0x0091), .driver_info = USB_QUIRK_RESET_RESUME }, /* ASUS Base Station(T100) */ { USB_DEVICE(0x0b05, 0x17e0), .driver_info = USB_QUIRK_IGNORE_REMOTE_WAKEUP }, /* Realtek Semiconductor Corp. Mass Storage Device (Multicard Reader)*/ { USB_DEVICE(0x0bda, 0x0151), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Realtek hub in Dell WD19 (Type-C) */ { USB_DEVICE(0x0bda, 0x0487), .driver_info = USB_QUIRK_NO_LPM }, /* Generic RTL8153 based ethernet adapters */ { USB_DEVICE(0x0bda, 0x8153), .driver_info = USB_QUIRK_NO_LPM }, /* SONiX USB DEVICE Touchpad */ { USB_DEVICE(0x0c45, 0x7056), .driver_info = USB_QUIRK_IGNORE_REMOTE_WAKEUP }, /* Action Semiconductor flash disk */ { USB_DEVICE(0x10d6, 0x2200), .driver_info = USB_QUIRK_STRING_FETCH_255 }, /* novation SoundControl XL */ { USB_DEVICE(0x1235, 0x0061), .driver_info = USB_QUIRK_RESET_RESUME }, /* Focusrite Scarlett Solo USB */ { USB_DEVICE(0x1235, 0x8211), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, /* Huawei 4G LTE module */ { USB_DEVICE(0x12d1, 0x15bb), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, { USB_DEVICE(0x12d1, 0x15c3), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, /* SKYMEDI USB_DRIVE */ { USB_DEVICE(0x1516, 0x8628), .driver_info = USB_QUIRK_RESET_RESUME }, /* Razer - Razer Blade Keyboard */ { USB_DEVICE(0x1532, 0x0116), .driver_info = USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL }, /* Lenovo ThinkPad OneLink+ Dock twin hub controllers (VIA Labs VL812) */ { USB_DEVICE(0x17ef, 0x1018), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x17ef, 0x1019), .driver_info = USB_QUIRK_RESET_RESUME }, /* Lenovo USB-C to Ethernet Adapter RTL8153-04 */ { USB_DEVICE(0x17ef, 0x720c), .driver_info = USB_QUIRK_NO_LPM }, /* Lenovo Powered USB-C Travel Hub (4X90S92381, RTL8153 GigE) */ { USB_DEVICE(0x17ef, 0x721e), .driver_info = USB_QUIRK_NO_LPM }, /* Lenovo ThinkCenter A630Z TI024Gen3 usb-audio */ { USB_DEVICE(0x17ef, 0xa012), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, /* Lenovo ThinkPad USB-C Dock Gen2 Ethernet (RTL8153 GigE) */ { USB_DEVICE(0x17ef, 0xa387), .driver_info = USB_QUIRK_NO_LPM }, /* BUILDWIN Photo Frame */ { USB_DEVICE(0x1908, 0x1315), .driver_info = USB_QUIRK_HONOR_BNUMINTERFACES }, /* Protocol and OTG Electrical Test Device */ { USB_DEVICE(0x1a0a, 0x0200), .driver_info = USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL }, /* Terminus Technology Inc. 
Hub */ { USB_DEVICE(0x1a40, 0x0101), .driver_info = USB_QUIRK_HUB_SLOW_RESET }, /* Corsair K70 RGB */ { USB_DEVICE(0x1b1c, 0x1b13), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_DELAY_CTRL_MSG }, /* Corsair Strafe */ { USB_DEVICE(0x1b1c, 0x1b15), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_DELAY_CTRL_MSG }, /* Corsair Strafe RGB */ { USB_DEVICE(0x1b1c, 0x1b20), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_DELAY_CTRL_MSG }, /* Corsair K70 LUX RGB */ { USB_DEVICE(0x1b1c, 0x1b33), .driver_info = USB_QUIRK_DELAY_INIT }, /* Corsair K70 LUX */ { USB_DEVICE(0x1b1c, 0x1b36), .driver_info = USB_QUIRK_DELAY_INIT }, /* Corsair K70 RGB RAPDIFIRE */ { USB_DEVICE(0x1b1c, 0x1b38), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_DELAY_CTRL_MSG }, /* MIDI keyboard WORLDE MINI */ { USB_DEVICE(0x1c75, 0x0204), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Acer C120 LED Projector */ { USB_DEVICE(0x1de1, 0xc102), .driver_info = USB_QUIRK_NO_LPM }, /* Blackmagic Design Intensity Shuttle */ { USB_DEVICE(0x1edb, 0xbd3b), .driver_info = USB_QUIRK_NO_LPM }, /* Blackmagic Design UltraStudio SDI */ { USB_DEVICE(0x1edb, 0xbd4f), .driver_info = USB_QUIRK_NO_LPM }, /* Hauppauge HVR-950q */ { USB_DEVICE(0x2040, 0x7200), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Raydium Touchscreen */ { USB_DEVICE(0x2386, 0x3114), .driver_info = USB_QUIRK_NO_LPM }, { USB_DEVICE(0x2386, 0x3119), .driver_info = USB_QUIRK_NO_LPM }, { USB_DEVICE(0x2386, 0x350e), .driver_info = USB_QUIRK_NO_LPM }, /* APTIV AUTOMOTIVE HUB */ { USB_DEVICE(0x2c48, 0x0132), .driver_info = USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT }, /* DJI CineSSD */ { USB_DEVICE(0x2ca3, 0x0031), .driver_info = USB_QUIRK_NO_LPM }, /* Alcor Link AK9563 SC Reader used in 2022 Lenovo ThinkPads */ { USB_DEVICE(0x2ce3, 0x9563), .driver_info = USB_QUIRK_NO_LPM }, /* DELL USB GEN2 */ { USB_DEVICE(0x413c, 0xb062), .driver_info = USB_QUIRK_NO_LPM | USB_QUIRK_RESET_RESUME }, /* VCOM device */ { USB_DEVICE(0x4296, 0x7570), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* INTEL VALUE SSD */ { USB_DEVICE(0x8086, 0xf1a5), .driver_info = USB_QUIRK_RESET_RESUME }, { } /* terminating entry must be last */ }; static const struct usb_device_id usb_interface_quirk_list[] = { /* Logitech UVC Cameras */ { USB_VENDOR_AND_INTERFACE_INFO(0x046d, USB_CLASS_VIDEO, 1, 0), .driver_info = USB_QUIRK_RESET_RESUME }, { } /* terminating entry must be last */ }; static const struct usb_device_id usb_amd_resume_quirk_list[] = { /* Lenovo Mouse with Pixart controller */ { USB_DEVICE(0x17ef, 0x602e), .driver_info = USB_QUIRK_RESET_RESUME }, /* Pixart Mouse */ { USB_DEVICE(0x093a, 0x2500), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x093a, 0x2510), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x093a, 0x2521), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x03f0, 0x2b4a), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Optical Mouse M90/M100 */ { USB_DEVICE(0x046d, 0xc05a), .driver_info = USB_QUIRK_RESET_RESUME }, { } /* terminating entry must be last */ }; /* * Entries for endpoints that should be ignored when parsing configuration * descriptors. * * Matched for devices with USB_QUIRK_ENDPOINT_IGNORE. 
*/ static const struct usb_device_id usb_endpoint_ignore[] = { { USB_DEVICE_INTERFACE_NUMBER(0x06f8, 0xb000, 5), .driver_info = 0x01 }, { USB_DEVICE_INTERFACE_NUMBER(0x06f8, 0xb000, 5), .driver_info = 0x81 }, { USB_DEVICE_INTERFACE_NUMBER(0x0926, 0x0202, 1), .driver_info = 0x85 }, { USB_DEVICE_INTERFACE_NUMBER(0x0926, 0x0208, 1), .driver_info = 0x85 }, { } }; bool usb_endpoint_is_ignored(struct usb_device *udev, struct usb_host_interface *intf, struct usb_endpoint_descriptor *epd) { const struct usb_device_id *id; unsigned int address; for (id = usb_endpoint_ignore; id->match_flags; ++id) { if (!usb_match_device(udev, id)) continue; if (!usb_match_one_id_intf(udev, intf, id)) continue; address = id->driver_info; if (address == epd->bEndpointAddress) return true; } return false; } static bool usb_match_any_interface(struct usb_device *udev, const struct usb_device_id *id) { unsigned int i; for (i = 0; i < udev->descriptor.bNumConfigurations; ++i) { struct usb_host_config *cfg = &udev->config[i]; unsigned int j; for (j = 0; j < cfg->desc.bNumInterfaces; ++j) { struct usb_interface_cache *cache; struct usb_host_interface *intf; cache = cfg->intf_cache[j]; if (cache->num_altsetting == 0) continue; intf = &cache->altsetting[0]; if (usb_match_one_id_intf(udev, intf, id)) return true; } } return false; } static int usb_amd_resume_quirk(struct usb_device *udev) { struct usb_hcd *hcd; hcd = bus_to_hcd(udev->bus); /* The device should be attached directly to root hub */ if (udev->level == 1 && hcd->amd_resume_bug == 1) return 1; return 0; } static u32 usb_detect_static_quirks(struct usb_device *udev, const struct usb_device_id *id) { u32 quirks = 0; for (; id->match_flags; id++) { if (!usb_match_device(udev, id)) continue; if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_INFO) && !usb_match_any_interface(udev, id)) continue; quirks |= (u32)(id->driver_info); } return quirks; } static u32 usb_detect_dynamic_quirks(struct usb_device *udev) { u16 vid = le16_to_cpu(udev->descriptor.idVendor); u16 pid = le16_to_cpu(udev->descriptor.idProduct); int i, flags = 0; mutex_lock(&quirk_mutex); for (i = 0; i < quirk_count; i++) { if (vid == quirk_list[i].vid && pid == quirk_list[i].pid) { flags = quirk_list[i].flags; break; } } mutex_unlock(&quirk_mutex); return flags; } /* * Detect any quirks the device has, and do any housekeeping for it if needed. */ void usb_detect_quirks(struct usb_device *udev) { udev->quirks = usb_detect_static_quirks(udev, usb_quirk_list); /* * Pixart-based mice would trigger remote wakeup issue on AMD * Yangtze chipset, so set them as RESET_RESUME flag. */ if (usb_amd_resume_quirk(udev)) udev->quirks |= usb_detect_static_quirks(udev, usb_amd_resume_quirk_list); udev->quirks ^= usb_detect_dynamic_quirks(udev); if (udev->quirks) dev_dbg(&udev->dev, "USB quirks for this device: %x\n", udev->quirks); #ifdef CONFIG_USB_DEFAULT_PERSIST if (!(udev->quirks & USB_QUIRK_RESET)) udev->persist_enabled = 1; #else /* Hubs are automatically enabled for USB-PERSIST */ if (udev->descriptor.bDeviceClass == USB_CLASS_HUB) udev->persist_enabled = 1; #endif /* CONFIG_USB_DEFAULT_PERSIST */ } void usb_detect_interface_quirks(struct usb_device *udev) { u32 quirks; quirks = usb_detect_static_quirks(udev, usb_interface_quirk_list); if (quirks == 0) return; dev_dbg(&udev->dev, "USB interface quirks for this device: %x\n", quirks); udev->quirks |= quirks; } void usb_release_quirk_list(void) { mutex_lock(&quirk_mutex); kfree(quirk_list); quirk_list = NULL; mutex_unlock(&quirk_mutex); }
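/*
 * Illustrative sketch only (not part of quirks.c): how the detected quirk
 * bits are typically consumed, and how the dynamic "quirks" parameter parsed
 * above is fed in from userspace.  "example_probe" and the 1234:5678 IDs are
 * hypothetical.
 *
 * Dynamic quirks use the VID:PID:flag-letters format, e.g.:
 *
 *   usbcore.quirks=1234:5678:bk                        (kernel command line)
 *   echo 1234:5678:bk > /sys/module/usbcore/parameters/quirks
 *
 * where 'b' requests USB_QUIRK_RESET_RESUME and 'k' USB_QUIRK_NO_LPM;
 * several entries may be chained with commas.
 */
#include <linux/usb.h>
#include <linux/usb/quirks.h>

static int example_probe(struct usb_device *udev)
{
	/* usb_detect_quirks() has already filled udev->quirks at this point. */
	if (udev->quirks & USB_QUIRK_NO_LPM)
		dev_dbg(&udev->dev, "LPM kept disabled by quirk\n");

	if (udev->quirks & USB_QUIRK_RESET_RESUME)
		dev_dbg(&udev->dev, "device will be reset on resume\n");

	return 0;
}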
// SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" struct debug_req_info { struct ethnl_req_info base; }; struct debug_reply_data { struct ethnl_reply_data base; u32 msg_mask; }; #define DEBUG_REPDATA(__reply_base) \ container_of(__reply_base, struct debug_reply_data, base) const struct nla_policy ethnl_debug_get_policy[] = { [ETHTOOL_A_DEBUG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int debug_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct debug_reply_data *data = DEBUG_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_msglevel) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->msg_mask = dev->ethtool_ops->get_msglevel(dev); ethnl_ops_complete(dev); return 0; } static int debug_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct debug_reply_data *data = DEBUG_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; return ethnl_bitset32_size(&data->msg_mask, NULL, NETIF_MSG_CLASS_COUNT, netif_msg_class_names, compact); } static int debug_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct debug_reply_data *data = DEBUG_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; return ethnl_put_bitset32(skb, ETHTOOL_A_DEBUG_MSGMASK, &data->msg_mask, NULL, NETIF_MSG_CLASS_COUNT, netif_msg_class_names, compact); } /* DEBUG_SET */ const struct nla_policy ethnl_debug_set_policy[] = { [ETHTOOL_A_DEBUG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_DEBUG_MSGMASK] = { .type = NLA_NESTED }, }; static int ethnl_set_debug_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_msglevel && ops->set_msglevel ? 1 : -EOPNOTSUPP; } static int ethnl_set_debug(struct ethnl_req_info *req_info, struct genl_info *info) { struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; bool mod = false; u32 msg_mask; int ret; msg_mask = dev->ethtool_ops->get_msglevel(dev); ret = ethnl_update_bitset32(&msg_mask, NETIF_MSG_CLASS_COUNT, tb[ETHTOOL_A_DEBUG_MSGMASK], netif_msg_class_names, info->extack, &mod); if (ret < 0 || !mod) return ret; dev->ethtool_ops->set_msglevel(dev, msg_mask); return 1; } const struct ethnl_request_ops ethnl_debug_request_ops = { .request_cmd = ETHTOOL_MSG_DEBUG_GET, .reply_cmd = ETHTOOL_MSG_DEBUG_GET_REPLY, .hdr_attr = ETHTOOL_A_DEBUG_HEADER, .req_info_size = sizeof(struct debug_req_info), .reply_data_size = sizeof(struct debug_reply_data), .prepare_data = debug_prepare_data, .reply_size = debug_reply_size, .fill_reply = debug_fill_reply, .set_validate = ethnl_set_debug_validate, .set = ethnl_set_debug, .set_ntf_cmd = ETHTOOL_MSG_DEBUG_NTF, };
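/*
 * Illustrative sketch only (not part of debug.c): the driver-side msglevel
 * hooks that ETHTOOL_MSG_DEBUG_GET/SET end up calling through ethtool_ops,
 * and how the resulting mask is normally consumed.  "example_priv" and the
 * other example_* names are hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/ethtool.h>

struct example_priv {
	u32 msg_enable;			/* bitmap of NETIF_MSG_* classes */
};

static u32 example_get_msglevel(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	return priv->msg_enable;
}

static void example_set_msglevel(struct net_device *dev, u32 value)
{
	struct example_priv *priv = netdev_priv(dev);

	priv->msg_enable = value;
}

static const struct ethtool_ops example_ethtool_ops = {
	.get_msglevel	= example_get_msglevel,
	.set_msglevel	= example_set_msglevel,
};

/*
 * Elsewhere the driver gates its diagnostics on the mask, e.g.:
 *
 *	if (netif_msg_link(priv))
 *		netdev_info(dev, "link is up\n");
 */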
/* * Copyright (C) 2011-2013 Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef DRM_RECT_H #define DRM_RECT_H #include <linux/types.h> /** * DOC: rect utils * * Utility functions to help manage rectangular areas for * clipping, scaling, etc. calculations. */ /** * struct drm_rect - two dimensional rectangle * @x1: horizontal starting coordinate (inclusive) * @x2: horizontal ending coordinate (exclusive) * @y1: vertical starting coordinate (inclusive) * @y2: vertical ending coordinate (exclusive) * * Note that this must match the layout of struct drm_mode_rect or the damage * helpers like drm_atomic_helper_damage_iter_init() break. */ struct drm_rect { int x1, y1, x2, y2; }; /** * DRM_RECT_INIT - initialize a rectangle from x/y/w/h * @x: x coordinate * @y: y coordinate * @w: width * @h: height * * RETURNS: * A new rectangle of the specified size. */ #define DRM_RECT_INIT(x, y, w, h) ((struct drm_rect){ \ .x1 = (x), \ .y1 = (y), \ .x2 = (x) + (w), \ .y2 = (y) + (h) }) /** * DRM_RECT_FMT - printf string for &struct drm_rect */ #define DRM_RECT_FMT "%dx%d%+d%+d" /** * DRM_RECT_ARG - printf arguments for &struct drm_rect * @r: rectangle struct */ #define DRM_RECT_ARG(r) drm_rect_width(r), drm_rect_height(r), (r)->x1, (r)->y1 /** * DRM_RECT_FP_FMT - printf string for &struct drm_rect in 16.16 fixed point */ #define DRM_RECT_FP_FMT "%d.%06ux%d.%06u%+d.%06u%+d.%06u" /** * DRM_RECT_FP_ARG - printf arguments for &struct drm_rect in 16.16 fixed point * @r: rectangle struct * * This is useful for e.g.
printing plane source rectangles, which are in 16.16 * fixed point. */ #define DRM_RECT_FP_ARG(r) \ drm_rect_width(r) >> 16, ((drm_rect_width(r) & 0xffff) * 15625) >> 10, \ drm_rect_height(r) >> 16, ((drm_rect_height(r) & 0xffff) * 15625) >> 10, \ (r)->x1 >> 16, (((r)->x1 & 0xffff) * 15625) >> 10, \ (r)->y1 >> 16, (((r)->y1 & 0xffff) * 15625) >> 10 /** * drm_rect_init - initialize the rectangle from x/y/w/h * @r: rectangle * @x: x coordinate * @y: y coordinate * @width: width * @height: height */ static inline void drm_rect_init(struct drm_rect *r, int x, int y, int width, int height) { r->x1 = x; r->y1 = y; r->x2 = x + width; r->y2 = y + height; } /** * drm_rect_adjust_size - adjust the size of the rectangle * @r: rectangle to be adjusted * @dw: horizontal adjustment * @dh: vertical adjustment * * Change the size of rectangle @r by @dw in the horizontal direction, * and by @dh in the vertical direction, while keeping the center * of @r stationary. * * Positive @dw and @dh increase the size, negative values decrease it. */ static inline void drm_rect_adjust_size(struct drm_rect *r, int dw, int dh) { r->x1 -= dw >> 1; r->y1 -= dh >> 1; r->x2 += (dw + 1) >> 1; r->y2 += (dh + 1) >> 1; } /** * drm_rect_translate - translate the rectangle * @r: rectangle to be translated * @dx: horizontal translation * @dy: vertical translation * * Move rectangle @r by @dx in the horizontal direction, * and by @dy in the vertical direction. */ static inline void drm_rect_translate(struct drm_rect *r, int dx, int dy) { r->x1 += dx; r->y1 += dy; r->x2 += dx; r->y2 += dy; } /** * drm_rect_translate_to - translate the rectangle to an absolute position * @r: rectangle to be translated * @x: horizontal position * @y: vertical position * * Move rectangle @r to @x in the horizontal direction, * and to @y in the vertical direction. */ static inline void drm_rect_translate_to(struct drm_rect *r, int x, int y) { drm_rect_translate(r, x - r->x1, y - r->y1); } /** * drm_rect_downscale - downscale a rectangle * @r: rectangle to be downscaled * @horz: horizontal downscale factor * @vert: vertical downscale factor * * Divide the coordinates of rectangle @r by @horz and @vert. */ static inline void drm_rect_downscale(struct drm_rect *r, int horz, int vert) { r->x1 /= horz; r->y1 /= vert; r->x2 /= horz; r->y2 /= vert; } /** * drm_rect_width - determine the rectangle width * @r: rectangle whose width is returned * * RETURNS: * The width of the rectangle. */ static inline int drm_rect_width(const struct drm_rect *r) { return r->x2 - r->x1; } /** * drm_rect_height - determine the rectangle height * @r: rectangle whose height is returned * * RETURNS: * The height of the rectangle. */ static inline int drm_rect_height(const struct drm_rect *r) { return r->y2 - r->y1; } /** * drm_rect_visible - determine if the rectangle is visible * @r: rectangle whose visibility is returned * * RETURNS: * %true if the rectangle is visible, %false otherwise. */ static inline bool drm_rect_visible(const struct drm_rect *r) { return drm_rect_width(r) > 0 && drm_rect_height(r) > 0; } /** * drm_rect_equals - determine if two rectangles are equal * @r1: first rectangle * @r2: second rectangle * * RETURNS: * %true if the rectangles are equal, %false otherwise. */ static inline bool drm_rect_equals(const struct drm_rect *r1, const struct drm_rect *r2) { return r1->x1 == r2->x1 && r1->x2 == r2->x2 && r1->y1 == r2->y1 && r1->y2 == r2->y2; } /** * drm_rect_fp_to_int - Convert a rect in 16.16 fixed point form to int form.
* @dst: rect to be stored the converted value * @src: rect in 16.16 fixed point form */ static inline void drm_rect_fp_to_int(struct drm_rect *dst, const struct drm_rect *src) { drm_rect_init(dst, src->x1 >> 16, src->y1 >> 16, drm_rect_width(src) >> 16, drm_rect_height(src) >> 16); } bool drm_rect_intersect(struct drm_rect *r, const struct drm_rect *clip); bool drm_rect_clip_scaled(struct drm_rect *src, struct drm_rect *dst, const struct drm_rect *clip); int drm_rect_calc_hscale(const struct drm_rect *src, const struct drm_rect *dst, int min_hscale, int max_hscale); int drm_rect_calc_vscale(const struct drm_rect *src, const struct drm_rect *dst, int min_vscale, int max_vscale); void drm_rect_debug_print(const char *prefix, const struct drm_rect *r, bool fixed_point); void drm_rect_rotate(struct drm_rect *r, int width, int height, unsigned int rotation); void drm_rect_rotate_inv(struct drm_rect *r, int width, int height, unsigned int rotation); #endif
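/*
 * Illustrative sketch only (not part of drm_rect.h): clipping a destination
 * rectangle against the CRTC bounds with the helpers above.  The function
 * name and the coordinate values are arbitrary.
 */
#include <linux/printk.h>
#include <drm/drm_rect.h>

static void example_clip_to_crtc(void)
{
	struct drm_rect dst = DRM_RECT_INIT(100, 50, 800, 600);	/* x, y, w, h */
	struct drm_rect crtc = DRM_RECT_INIT(0, 0, 640, 480);

	/* drm_rect_intersect() clips @dst in place and reports visibility. */
	if (drm_rect_intersect(&dst, &crtc))
		pr_info("visible region: " DRM_RECT_FMT "\n", DRM_RECT_ARG(&dst));
	else
		pr_info("plane is fully clipped away\n");
}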
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * * Modified by Fred N. van Kempen, 01/29/93, to add line disciplines * which can be dynamically activated and de-activated by the line * discipline handling modules (like SLIP). */ #include <linux/bits.h> #include <linux/types.h> #include <linux/termios.h> #include <linux/errno.h> #include <linux/sched/signal.h> #include <linux/kernel.h> #include <linux/major.h> #include <linux/tty.h> #include <linux/fcntl.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/mutex.h> #include <linux/compat.h> #include <linux/termios_internal.h> #include "tty.h" #include <asm/io.h> #include <linux/uaccess.h> #undef DEBUG /* * Internal flag options for termios setting behavior */ #define TERMIOS_FLUSH BIT(0) #define TERMIOS_WAIT BIT(1) #define TERMIOS_TERMIO BIT(2) #define TERMIOS_OLD BIT(3) /** * tty_chars_in_buffer - characters pending * @tty: terminal * * Returns: the number of bytes of data in the device private output queue. If * no private method is supplied there is assumed to be no queue on the device. */ unsigned int tty_chars_in_buffer(struct tty_struct *tty) { if (tty->ops->chars_in_buffer) return tty->ops->chars_in_buffer(tty); return 0; } EXPORT_SYMBOL(tty_chars_in_buffer); /** * tty_write_room - write queue space * @tty: terminal * * Returns: the number of bytes that can be queued to this device at the present * time. The result should be treated as a guarantee and the driver cannot * offer a value it later shrinks by more than the number of bytes written. If * no method is provided, 2K is always returned and data may be lost as there * will be no flow control. */ unsigned int tty_write_room(struct tty_struct *tty) { if (tty->ops->write_room) return tty->ops->write_room(tty); return 2048; } EXPORT_SYMBOL(tty_write_room); /** * tty_driver_flush_buffer - discard internal buffer * @tty: terminal * * Discard the internal output buffer for this device. If no method is provided, * then either the buffer cannot be hardware flushed or there is no buffer * driver side. */ void tty_driver_flush_buffer(struct tty_struct *tty) { if (tty->ops->flush_buffer) tty->ops->flush_buffer(tty); } EXPORT_SYMBOL(tty_driver_flush_buffer); /** * tty_unthrottle - flow control * @tty: terminal * * Indicate that a @tty may continue transmitting data down the stack. Takes * the &tty_struct->termios_rwsem to protect against parallel * throttle/unthrottle and also to ensure the driver can consistently reference * its own termios data at this point when implementing software flow control. * * Drivers should however remember that the stack can issue a throttle, then * change flow control method, then unthrottle.
*/ void tty_unthrottle(struct tty_struct *tty) { down_write(&tty->termios_rwsem); if (test_and_clear_bit(TTY_THROTTLED, &tty->flags) && tty->ops->unthrottle) tty->ops->unthrottle(tty); tty->flow_change = TTY_FLOW_NO_CHANGE; up_write(&tty->termios_rwsem); } EXPORT_SYMBOL(tty_unthrottle); /** * tty_throttle_safe - flow control * @tty: terminal * * Indicate that a @tty should stop transmitting data down the stack. * tty_throttle_safe() will only attempt throttle if @tty->flow_change is * %TTY_THROTTLE_SAFE. Prevents an accidental throttle due to race conditions * when throttling is conditional on factors evaluated prior to throttling. * * Returns: %true if @tty is throttled (or was already throttled) */ bool tty_throttle_safe(struct tty_struct *tty) { bool ret = true; mutex_lock(&tty->throttle_mutex); if (!tty_throttled(tty)) { if (tty->flow_change != TTY_THROTTLE_SAFE) ret = false; else { set_bit(TTY_THROTTLED, &tty->flags); if (tty->ops->throttle) tty->ops->throttle(tty); } } mutex_unlock(&tty->throttle_mutex); return ret; } /** * tty_unthrottle_safe - flow control * @tty: terminal * * Similar to tty_unthrottle() but will only attempt unthrottle if * @tty->flow_change is %TTY_UNTHROTTLE_SAFE. Prevents an accidental unthrottle * due to race conditions when unthrottling is conditional on factors evaluated * prior to unthrottling. * * Returns: %true if @tty is unthrottled (or was already unthrottled) */ bool tty_unthrottle_safe(struct tty_struct *tty) { bool ret = true; mutex_lock(&tty->throttle_mutex); if (tty_throttled(tty)) { if (tty->flow_change != TTY_UNTHROTTLE_SAFE) ret = false; else { clear_bit(TTY_THROTTLED, &tty->flags); if (tty->ops->unthrottle) tty->ops->unthrottle(tty); } } mutex_unlock(&tty->throttle_mutex); return ret; } /** * tty_wait_until_sent - wait for I/O to finish * @tty: tty we are waiting for * @timeout: how long we will wait * * Wait for characters pending in a tty driver to hit the wire, or for a * timeout to occur (eg due to flow control). * * Locking: none */ void tty_wait_until_sent(struct tty_struct *tty, long timeout) { if (!timeout) timeout = MAX_SCHEDULE_TIMEOUT; timeout = wait_event_interruptible_timeout(tty->write_wait, !tty_chars_in_buffer(tty), timeout); if (timeout <= 0) return; if (timeout == MAX_SCHEDULE_TIMEOUT) timeout = 0; if (tty->ops->wait_until_sent) tty->ops->wait_until_sent(tty, timeout); } EXPORT_SYMBOL(tty_wait_until_sent); /* * Termios Helper Methods */ static void unset_locked_termios(struct tty_struct *tty, const struct ktermios *old) { struct ktermios *termios = &tty->termios; struct ktermios *locked = &tty->termios_locked; int i; #define NOSET_MASK(x, y, z) (x = ((x) & ~(z)) | ((y) & (z))) NOSET_MASK(termios->c_iflag, old->c_iflag, locked->c_iflag); NOSET_MASK(termios->c_oflag, old->c_oflag, locked->c_oflag); NOSET_MASK(termios->c_cflag, old->c_cflag, locked->c_cflag); NOSET_MASK(termios->c_lflag, old->c_lflag, locked->c_lflag); termios->c_line = locked->c_line ? old->c_line : termios->c_line; for (i = 0; i < NCCS; i++) termios->c_cc[i] = locked->c_cc[i] ? old->c_cc[i] : termios->c_cc[i]; /* FIXME: What should we do for i/ospeed */ } /** * tty_termios_copy_hw - copy hardware settings * @new: new termios * @old: old termios * * Propagate the hardware specific terminal setting bits from the @old termios * structure to the @new one. This is used in cases where the hardware does not * support reconfiguration or as a helper in some cases where only minimal * reconfiguration is supported. 
*/ void tty_termios_copy_hw(struct ktermios *new, const struct ktermios *old) { /* The bits a dumb device handles in software. Smart devices need to always provide a set_termios method */ new->c_cflag &= HUPCL | CREAD | CLOCAL; new->c_cflag |= old->c_cflag & ~(HUPCL | CREAD | CLOCAL); new->c_ispeed = old->c_ispeed; new->c_ospeed = old->c_ospeed; } EXPORT_SYMBOL(tty_termios_copy_hw); /** * tty_termios_hw_change - check for setting change * @a: termios * @b: termios to compare * * Check if any of the bits that affect a dumb device have changed between the * two termios structures, or a speed change is needed. * * Returns: %true if change is needed */ bool tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b) { if (a->c_ispeed != b->c_ispeed || a->c_ospeed != b->c_ospeed) return true; if ((a->c_cflag ^ b->c_cflag) & ~(HUPCL | CREAD | CLOCAL)) return true; return false; } EXPORT_SYMBOL(tty_termios_hw_change); /** * tty_get_char_size - get size of a character * @cflag: termios cflag value * * Returns: size (in bits) of a character depending on @cflag's %CSIZE setting */ unsigned char tty_get_char_size(unsigned int cflag) { switch (cflag & CSIZE) { case CS5: return 5; case CS6: return 6; case CS7: return 7; case CS8: default: return 8; } } EXPORT_SYMBOL_GPL(tty_get_char_size); /** * tty_get_frame_size - get size of a frame * @cflag: termios cflag value * * Get the size (in bits) of a frame depending on @cflag's %CSIZE, %CSTOPB, and * %PARENB setting. The result is a sum of character size, start and stop bits * -- one bit each -- second stop bit (if set), and parity bit (if set). * * Returns: size (in bits) of a frame depending on @cflag's setting. */ unsigned char tty_get_frame_size(unsigned int cflag) { unsigned char bits = 2 + tty_get_char_size(cflag); if (cflag & CSTOPB) bits++; if (cflag & PARENB) bits++; if (cflag & ADDRB) bits++; return bits; } EXPORT_SYMBOL_GPL(tty_get_frame_size); /** * tty_set_termios - update termios values * @tty: tty to update * @new_termios: desired new value * * Perform updates to the termios values set on this @tty. A master pty's * termios should never be set. * * Locking: &tty_struct->termios_rwsem */ int tty_set_termios(struct tty_struct *tty, struct ktermios *new_termios) { struct ktermios old_termios; struct tty_ldisc *ld; WARN_ON(tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER); /* * Perform the actual termios internal changes under lock. */ /* FIXME: we need to decide on some locking/ordering semantics for the set_termios notification eventually */ down_write(&tty->termios_rwsem); old_termios = tty->termios; tty->termios = *new_termios; unset_locked_termios(tty, &old_termios); /* Reset any ADDRB changes, ADDRB is changed through ->rs485_config() */ tty->termios.c_cflag ^= (tty->termios.c_cflag ^ old_termios.c_cflag) & ADDRB; if (tty->ops->set_termios) tty->ops->set_termios(tty, &old_termios); else tty_termios_copy_hw(&tty->termios, &old_termios); ld = tty_ldisc_ref(tty); if (ld != NULL) { if (ld->ops->set_termios) ld->ops->set_termios(tty, &old_termios); tty_ldisc_deref(ld); } up_write(&tty->termios_rwsem); return 0; } EXPORT_SYMBOL_GPL(tty_set_termios); /* * Translate a "termio" structure into a "termios". Ugh. 
*/ __weak int user_termio_to_kernel_termios(struct ktermios *termios, struct termio __user *termio) { struct termio v; if (copy_from_user(&v, termio, sizeof(struct termio))) return -EFAULT; termios->c_iflag = (0xffff0000 & termios->c_iflag) | v.c_iflag; termios->c_oflag = (0xffff0000 & termios->c_oflag) | v.c_oflag; termios->c_cflag = (0xffff0000 & termios->c_cflag) | v.c_cflag; termios->c_lflag = (0xffff0000 & termios->c_lflag) | v.c_lflag; termios->c_line = v.c_line; memcpy(termios->c_cc, v.c_cc, NCC); return 0; } /* * Translate a "termios" structure into a "termio". Ugh. */ __weak int kernel_termios_to_user_termio(struct termio __user *termio, struct ktermios *termios) { struct termio v; memset(&v, 0, sizeof(struct termio)); v.c_iflag = termios->c_iflag; v.c_oflag = termios->c_oflag; v.c_cflag = termios->c_cflag; v.c_lflag = termios->c_lflag; v.c_line = termios->c_line; memcpy(v.c_cc, termios->c_cc, NCC); return copy_to_user(termio, &v, sizeof(struct termio)); } #ifdef TCGETS2 __weak int user_termios_to_kernel_termios(struct ktermios *k, struct termios2 __user *u) { return copy_from_user(k, u, sizeof(struct termios2)); } __weak int kernel_termios_to_user_termios(struct termios2 __user *u, struct ktermios *k) { return copy_to_user(u, k, sizeof(struct termios2)); } __weak int user_termios_to_kernel_termios_1(struct ktermios *k, struct termios __user *u) { return copy_from_user(k, u, sizeof(struct termios)); } __weak int kernel_termios_to_user_termios_1(struct termios __user *u, struct ktermios *k) { return copy_to_user(u, k, sizeof(struct termios)); } #else __weak int user_termios_to_kernel_termios(struct ktermios *k, struct termios __user *u) { return copy_from_user(k, u, sizeof(struct termios)); } __weak int kernel_termios_to_user_termios(struct termios __user *u, struct ktermios *k) { return copy_to_user(u, k, sizeof(struct termios)); } #endif /* TCGETS2 */ /** * set_termios - set termios values for a tty * @tty: terminal device * @arg: user data * @opt: option information * * Helper function to prepare termios data and run necessary other functions * before using tty_set_termios() to do the actual changes.
* * Locking: called functions take &tty_struct->ldisc_sem and * &tty_struct->termios_rwsem locks * * Returns: 0 on success, an error otherwise */ static int set_termios(struct tty_struct *tty, void __user *arg, int opt) { struct ktermios tmp_termios; struct tty_ldisc *ld; int retval = tty_check_change(tty); if (retval) return retval; down_read(&tty->termios_rwsem); tmp_termios = tty->termios; up_read(&tty->termios_rwsem); if (opt & TERMIOS_TERMIO) { if (user_termio_to_kernel_termios(&tmp_termios, (struct termio __user *)arg)) return -EFAULT; #ifdef TCGETS2 } else if (opt & TERMIOS_OLD) { if (user_termios_to_kernel_termios_1(&tmp_termios, (struct termios __user *)arg)) return -EFAULT; } else { if (user_termios_to_kernel_termios(&tmp_termios, (struct termios2 __user *)arg)) return -EFAULT; } #else } else if (user_termios_to_kernel_termios(&tmp_termios, (struct termios __user *)arg)) return -EFAULT; #endif /* If old style Bfoo values are used then load c_ispeed/c_ospeed * with the real speed so its unconditionally usable */ tmp_termios.c_ispeed = tty_termios_input_baud_rate(&tmp_termios); tmp_termios.c_ospeed = tty_termios_baud_rate(&tmp_termios); if (opt & (TERMIOS_FLUSH|TERMIOS_WAIT)) { retry_write_wait: retval = wait_event_interruptible(tty->write_wait, !tty_chars_in_buffer(tty)); if (retval < 0) return retval; if (tty_write_lock(tty, false) < 0) goto retry_write_wait; /* Racing writer? */ if (tty_chars_in_buffer(tty)) { tty_write_unlock(tty); goto retry_write_wait; } ld = tty_ldisc_ref(tty); if (ld != NULL) { if ((opt & TERMIOS_FLUSH) && ld->ops->flush_buffer) ld->ops->flush_buffer(tty); tty_ldisc_deref(ld); } if ((opt & TERMIOS_WAIT) && tty->ops->wait_until_sent) { tty->ops->wait_until_sent(tty, 0); if (signal_pending(current)) { tty_write_unlock(tty); return -ERESTARTSYS; } } tty_set_termios(tty, &tmp_termios); tty_write_unlock(tty); } else { tty_set_termios(tty, &tmp_termios); } /* FIXME: Arguably if tmp_termios == tty->termios AND the actual requested termios was not tmp_termios then we may want to return an error as no user requested change has succeeded */ return 0; } static void copy_termios(struct tty_struct *tty, struct ktermios *kterm) { down_read(&tty->termios_rwsem); *kterm = tty->termios; up_read(&tty->termios_rwsem); } static void copy_termios_locked(struct tty_struct *tty, struct ktermios *kterm) { down_read(&tty->termios_rwsem); *kterm = tty->termios_locked; up_read(&tty->termios_rwsem); } static int get_termio(struct tty_struct *tty, struct termio __user *termio) { struct ktermios kterm; copy_termios(tty, &kterm); if (kernel_termios_to_user_termio(termio, &kterm)) return -EFAULT; return 0; } #ifdef TIOCGETP /* * These are deprecated, but there is limited support.. * * The "sg_flags" translation is a joke.. */ static int get_sgflags(struct tty_struct *tty) { int flags = 0; if (!L_ICANON(tty)) { if (L_ISIG(tty)) flags |= 0x02; /* cbreak */ else flags |= 0x20; /* raw */ } if (L_ECHO(tty)) flags |= 0x08; /* echo */ if (O_OPOST(tty)) if (O_ONLCR(tty)) flags |= 0x10; /* crmod */ return flags; } static int get_sgttyb(struct tty_struct *tty, struct sgttyb __user *sgttyb) { struct sgttyb tmp; down_read(&tty->termios_rwsem); tmp.sg_ispeed = tty->termios.c_ispeed; tmp.sg_ospeed = tty->termios.c_ospeed; tmp.sg_erase = tty->termios.c_cc[VERASE]; tmp.sg_kill = tty->termios.c_cc[VKILL]; tmp.sg_flags = get_sgflags(tty); up_read(&tty->termios_rwsem); return copy_to_user(sgttyb, &tmp, sizeof(tmp)) ? 
-EFAULT : 0; } static void set_sgflags(struct ktermios *termios, int flags) { termios->c_iflag = ICRNL | IXON; termios->c_oflag = 0; termios->c_lflag = ISIG | ICANON; if (flags & 0x02) { /* cbreak */ termios->c_iflag = 0; termios->c_lflag &= ~ICANON; } if (flags & 0x08) { /* echo */ termios->c_lflag |= ECHO | ECHOE | ECHOK | ECHOCTL | ECHOKE | IEXTEN; } if (flags & 0x10) { /* crmod */ termios->c_oflag |= OPOST | ONLCR; } if (flags & 0x20) { /* raw */ termios->c_iflag = 0; termios->c_lflag &= ~(ISIG | ICANON); } if (!(termios->c_lflag & ICANON)) { termios->c_cc[VMIN] = 1; termios->c_cc[VTIME] = 0; } } /** * set_sgttyb - set legacy terminal values * @tty: tty structure * @sgttyb: pointer to old style terminal structure * * Updates a terminal from the legacy BSD style terminal information structure. * * Locking: &tty_struct->termios_rwsem * * Returns: 0 on success, an error otherwise */ static int set_sgttyb(struct tty_struct *tty, struct sgttyb __user *sgttyb) { int retval; struct sgttyb tmp; struct ktermios termios; retval = tty_check_change(tty); if (retval) return retval; if (copy_from_user(&tmp, sgttyb, sizeof(tmp))) return -EFAULT; down_write(&tty->termios_rwsem); termios = tty->termios; termios.c_cc[VERASE] = tmp.sg_erase; termios.c_cc[VKILL] = tmp.sg_kill; set_sgflags(&termios, tmp.sg_flags); /* Try and encode into Bfoo format */ tty_termios_encode_baud_rate(&termios, termios.c_ispeed, termios.c_ospeed); up_write(&tty->termios_rwsem); tty_set_termios(tty, &termios); return 0; } #endif #ifdef TIOCGETC static int get_tchars(struct tty_struct *tty, struct tchars __user *tchars) { struct tchars tmp; down_read(&tty->termios_rwsem); tmp.t_intrc = tty->termios.c_cc[VINTR]; tmp.t_quitc = tty->termios.c_cc[VQUIT]; tmp.t_startc = tty->termios.c_cc[VSTART]; tmp.t_stopc = tty->termios.c_cc[VSTOP]; tmp.t_eofc = tty->termios.c_cc[VEOF]; tmp.t_brkc = tty->termios.c_cc[VEOL2]; /* what is brkc anyway? */ up_read(&tty->termios_rwsem); return copy_to_user(tchars, &tmp, sizeof(tmp)) ? -EFAULT : 0; } static int set_tchars(struct tty_struct *tty, struct tchars __user *tchars) { struct tchars tmp; if (copy_from_user(&tmp, tchars, sizeof(tmp))) return -EFAULT; down_write(&tty->termios_rwsem); tty->termios.c_cc[VINTR] = tmp.t_intrc; tty->termios.c_cc[VQUIT] = tmp.t_quitc; tty->termios.c_cc[VSTART] = tmp.t_startc; tty->termios.c_cc[VSTOP] = tmp.t_stopc; tty->termios.c_cc[VEOF] = tmp.t_eofc; tty->termios.c_cc[VEOL2] = tmp.t_brkc; /* what is brkc anyway? */ up_write(&tty->termios_rwsem); return 0; } #endif #ifdef TIOCGLTC static int get_ltchars(struct tty_struct *tty, struct ltchars __user *ltchars) { struct ltchars tmp; down_read(&tty->termios_rwsem); tmp.t_suspc = tty->termios.c_cc[VSUSP]; /* what is dsuspc anyway? */ tmp.t_dsuspc = tty->termios.c_cc[VSUSP]; tmp.t_rprntc = tty->termios.c_cc[VREPRINT]; /* what is flushc anyway? */ tmp.t_flushc = tty->termios.c_cc[VEOL2]; tmp.t_werasc = tty->termios.c_cc[VWERASE]; tmp.t_lnextc = tty->termios.c_cc[VLNEXT]; up_read(&tty->termios_rwsem); return copy_to_user(ltchars, &tmp, sizeof(tmp)) ? -EFAULT : 0; } static int set_ltchars(struct tty_struct *tty, struct ltchars __user *ltchars) { struct ltchars tmp; if (copy_from_user(&tmp, ltchars, sizeof(tmp))) return -EFAULT; down_write(&tty->termios_rwsem); tty->termios.c_cc[VSUSP] = tmp.t_suspc; /* what is dsuspc anyway? */ tty->termios.c_cc[VEOL2] = tmp.t_dsuspc; tty->termios.c_cc[VREPRINT] = tmp.t_rprntc; /* what is flushc anyway? 
*/ tty->termios.c_cc[VEOL2] = tmp.t_flushc; tty->termios.c_cc[VWERASE] = tmp.t_werasc; tty->termios.c_cc[VLNEXT] = tmp.t_lnextc; up_write(&tty->termios_rwsem); return 0; } #endif /** * tty_change_softcar - carrier change ioctl helper * @tty: tty to update * @enable: enable/disable %CLOCAL * * Perform a change to the %CLOCAL state and call into the driver layer to make * it visible. * * Locking: &tty_struct->termios_rwsem. * * Returns: 0 on success, an error otherwise */ static int tty_change_softcar(struct tty_struct *tty, bool enable) { int ret = 0; struct ktermios old; tcflag_t bit = enable ? CLOCAL : 0; down_write(&tty->termios_rwsem); old = tty->termios; tty->termios.c_cflag &= ~CLOCAL; tty->termios.c_cflag |= bit; if (tty->ops->set_termios) tty->ops->set_termios(tty, &old); if (C_CLOCAL(tty) != bit) ret = -EINVAL; up_write(&tty->termios_rwsem); return ret; } /** * tty_mode_ioctl - mode related ioctls * @tty: tty for the ioctl * @cmd: command * @arg: ioctl argument * * Perform non-line discipline specific mode control ioctls. This is designed * to be called by line disciplines to ensure they provide consistent mode * setting. */ int tty_mode_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct tty_struct *real_tty; void __user *p = (void __user *)arg; int ret = 0; struct ktermios kterm; if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) real_tty = tty->link; else real_tty = tty; switch (cmd) { #ifdef TIOCGETP case TIOCGETP: return get_sgttyb(real_tty, (struct sgttyb __user *) arg); case TIOCSETP: case TIOCSETN: return set_sgttyb(real_tty, (struct sgttyb __user *) arg); #endif #ifdef TIOCGETC case TIOCGETC: return get_tchars(real_tty, p); case TIOCSETC: return set_tchars(real_tty, p); #endif #ifdef TIOCGLTC case TIOCGLTC: return get_ltchars(real_tty, p); case TIOCSLTC: return set_ltchars(real_tty, p); #endif case TCSETSF: return set_termios(real_tty, p, TERMIOS_FLUSH | TERMIOS_WAIT | TERMIOS_OLD); case TCSETSW: return set_termios(real_tty, p, TERMIOS_WAIT | TERMIOS_OLD); case TCSETS: return set_termios(real_tty, p, TERMIOS_OLD); #ifndef TCGETS2 case TCGETS: copy_termios(real_tty, &kterm); if (kernel_termios_to_user_termios((struct termios __user *)arg, &kterm)) ret = -EFAULT; return ret; #else case TCGETS: copy_termios(real_tty, &kterm); if (kernel_termios_to_user_termios_1((struct termios __user *)arg, &kterm)) ret = -EFAULT; return ret; case TCGETS2: copy_termios(real_tty, &kterm); if (kernel_termios_to_user_termios((struct termios2 __user *)arg, &kterm)) ret = -EFAULT; return ret; case TCSETSF2: return set_termios(real_tty, p, TERMIOS_FLUSH | TERMIOS_WAIT); case TCSETSW2: return set_termios(real_tty, p, TERMIOS_WAIT); case TCSETS2: return set_termios(real_tty, p, 0); #endif case TCGETA: return get_termio(real_tty, p); case TCSETAF: return set_termios(real_tty, p, TERMIOS_FLUSH | TERMIOS_WAIT | TERMIOS_TERMIO); case TCSETAW: return set_termios(real_tty, p, TERMIOS_WAIT | TERMIOS_TERMIO); case TCSETA: return set_termios(real_tty, p, TERMIOS_TERMIO); #ifndef TCGETS2 case TIOCGLCKTRMIOS: copy_termios_locked(real_tty, &kterm); if (kernel_termios_to_user_termios((struct termios __user *)arg, &kterm)) ret = -EFAULT; return ret; case TIOCSLCKTRMIOS: if (!checkpoint_restore_ns_capable(&init_user_ns)) return -EPERM; copy_termios_locked(real_tty, &kterm); if (user_termios_to_kernel_termios(&kterm, (struct termios __user *) arg)) return -EFAULT; down_write(&real_tty->termios_rwsem); real_tty->termios_locked = kterm; 
up_write(&real_tty->termios_rwsem); return 0; #else case TIOCGLCKTRMIOS: copy_termios_locked(real_tty, &kterm); if (kernel_termios_to_user_termios_1((struct termios __user *)arg, &kterm)) ret = -EFAULT; return ret; case TIOCSLCKTRMIOS: if (!checkpoint_restore_ns_capable(&init_user_ns)) return -EPERM; copy_termios_locked(real_tty, &kterm); if (user_termios_to_kernel_termios_1(&kterm, (struct termios __user *) arg)) return -EFAULT; down_write(&real_tty->termios_rwsem); real_tty->termios_locked = kterm; up_write(&real_tty->termios_rwsem); return ret; #endif #ifdef TCGETX case TCGETX: case TCSETX: case TCSETXW: case TCSETXF: return -ENOTTY; #endif case TIOCGSOFTCAR: copy_termios(real_tty, &kterm); ret = put_user((kterm.c_cflag & CLOCAL) ? 1 : 0, (int __user *)arg); return ret; case TIOCSSOFTCAR: if (get_user(arg, (unsigned int __user *) arg)) return -EFAULT; return tty_change_softcar(real_tty, arg); default: return -ENOIOCTLCMD; } } EXPORT_SYMBOL_GPL(tty_mode_ioctl); /* Caller guarantees ldisc reference is held */ static int __tty_perform_flush(struct tty_struct *tty, unsigned long arg) { struct tty_ldisc *ld = tty->ldisc; switch (arg) { case TCIFLUSH: if (ld && ld->ops->flush_buffer) { ld->ops->flush_buffer(tty); tty_unthrottle(tty); } break; case TCIOFLUSH: if (ld && ld->ops->flush_buffer) { ld->ops->flush_buffer(tty); tty_unthrottle(tty); } fallthrough; case TCOFLUSH: tty_driver_flush_buffer(tty); break; default: return -EINVAL; } return 0; } int tty_perform_flush(struct tty_struct *tty, unsigned long arg) { struct tty_ldisc *ld; int retval = tty_check_change(tty); if (retval) return retval; ld = tty_ldisc_ref_wait(tty); retval = __tty_perform_flush(tty, arg); if (ld) tty_ldisc_deref(ld); return retval; } EXPORT_SYMBOL_GPL(tty_perform_flush); int n_tty_ioctl_helper(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { int retval; switch (cmd) { case TCXONC: retval = tty_check_change(tty); if (retval) return retval; switch (arg) { case TCOOFF: spin_lock_irq(&tty->flow.lock); if (!tty->flow.tco_stopped) { tty->flow.tco_stopped = true; __stop_tty(tty); } spin_unlock_irq(&tty->flow.lock); break; case TCOON: spin_lock_irq(&tty->flow.lock); if (tty->flow.tco_stopped) { tty->flow.tco_stopped = false; __start_tty(tty); } spin_unlock_irq(&tty->flow.lock); break; case TCIOFF: if (STOP_CHAR(tty) != __DISABLED_CHAR) retval = tty_send_xchar(tty, STOP_CHAR(tty)); break; case TCION: if (START_CHAR(tty) != __DISABLED_CHAR) retval = tty_send_xchar(tty, START_CHAR(tty)); break; default: return -EINVAL; } return retval; case TCFLSH: retval = tty_check_change(tty); if (retval) return retval; return __tty_perform_flush(tty, arg); default: /* Try the mode commands */ return tty_mode_ioctl(tty, cmd, arg); } } EXPORT_SYMBOL(n_tty_ioctl_helper);
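/*
 * Example (illustration only, not part of tty_ioctl.c): a minimal sketch of how
 * a simple UART driver's ->set_termios() callback might consume the helpers
 * above (tty_termios_hw_change(), tty_get_frame_size(), plus
 * tty_termios_baud_rate() from tty_baudrate.c). foo_uart_program_divisor() and
 * foo_uart_set_char_timeout() are hypothetical stand-ins for real register
 * programming, and <linux/tty.h> is assumed to be included. A device that
 * cannot honour some of the requested settings would instead call
 * tty_termios_copy_hw() to push the unsupported bits back into tty->termios.
 */
static void foo_uart_set_termios(struct tty_struct *tty,
				 const struct ktermios *old)
{
	unsigned int frame_bits;
	speed_t baud;

	/* Nothing the hardware cares about changed: skip reprogramming. */
	if (old && !tty_termios_hw_change(&tty->termios, old))
		return;

	baud = tty_termios_baud_rate(&tty->termios);
	/* start + data + optional parity + stop bit(s), per tty_get_frame_size() */
	frame_bits = tty_get_frame_size(tty->termios.c_cflag);

	/* Hypothetical hardware programming helpers. */
	foo_uart_program_divisor(tty, baud);
	foo_uart_set_char_timeout(tty, frame_bits);
}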
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_ENTRYKVM_H #define __LINUX_ENTRYKVM_H #include <linux/static_call_types.h> #include <linux/resume_user_mode.h> #include <linux/syscalls.h> #include <linux/seccomp.h> #include <linux/sched.h> #include <linux/tick.h> /* Transfer to guest mode work */ #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK #ifndef ARCH_XFER_TO_GUEST_MODE_WORK # define ARCH_XFER_TO_GUEST_MODE_WORK (0) #endif #define XFER_TO_GUEST_MODE_WORK \ (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \ _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK) struct kvm_vcpu; /** * arch_xfer_to_guest_mode_handle_work - Architecture specific xfer to guest * mode work handling function. * @vcpu: Pointer to current's VCPU data * @ti_work: Cached TIF flags gathered in xfer_to_guest_mode_handle_work() * * Invoked from xfer_to_guest_mode_handle_work(). Defaults to NOOP. Can be * replaced by architecture specific code. */ static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu, unsigned long ti_work); #ifndef arch_xfer_to_guest_mode_work static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu, unsigned long ti_work) { return 0; } #endif /** * xfer_to_guest_mode_handle_work - Check and handle pending work which needs * to be handled before going to guest mode * @vcpu: Pointer to current's VCPU data * * Returns: 0 or an error code */ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu); /** * xfer_to_guest_mode_prepare - Perform last minute preparation work that * needs to be handled while IRQs are disabled * upon entering the guest. * * Has to be invoked with interrupts disabled before the last call * to xfer_to_guest_mode_work_pending(). */ static inline void xfer_to_guest_mode_prepare(void) { lockdep_assert_irqs_disabled(); tick_nohz_user_enter_prepare(); } /** * __xfer_to_guest_mode_work_pending - Check if work is pending * * Returns: True if work pending, False otherwise. * * Bare variant of xfer_to_guest_mode_work_pending(). Can be called from * interrupt enabled code for racy quick checks with care. */ static inline bool __xfer_to_guest_mode_work_pending(void) { unsigned long ti_work = read_thread_flags(); return !!(ti_work & XFER_TO_GUEST_MODE_WORK); } /** * xfer_to_guest_mode_work_pending - Check if work is pending which needs to be * handled before returning to guest mode * * Returns: True if work pending, False otherwise. * * Has to be invoked with interrupts disabled before the transition to * guest mode. */ static inline bool xfer_to_guest_mode_work_pending(void) { lockdep_assert_irqs_disabled(); return __xfer_to_guest_mode_work_pending(); } #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ #endif
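/*
 * Example (illustration only, not part of entry-kvm.h): the calling pattern
 * the helpers above are designed for, roughly mirroring a hypervisor's vcpu
 * run loop. foo_enter_guest() is a hypothetical stand-in for the
 * architecture-specific world switch, and the -EAGAIN "retry the loop"
 * convention is likewise an assumption of this sketch.
 */
static int foo_vcpu_run_once(struct kvm_vcpu *vcpu)
{
	int ret;

	/* Racy pre-check with IRQs enabled; handles signals, resched, etc. */
	if (__xfer_to_guest_mode_work_pending()) {
		ret = xfer_to_guest_mode_handle_work(vcpu);
		if (ret)
			return ret;	/* e.g. -EINTR: drop back to userspace */
	}

	local_irq_disable();
	xfer_to_guest_mode_prepare();

	/* Final check with IRQs disabled, as the kernel-doc above requires. */
	if (xfer_to_guest_mode_work_pending()) {
		local_irq_enable();
		return -EAGAIN;		/* caller is expected to loop and retry */
	}

	ret = foo_enter_guest(vcpu);	/* hypothetical world switch */
	local_irq_enable();
	return ret;
}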
// SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 BSD socket options interface *
Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/net/ipv4/ip_sockglue.c * * FIXME: Make the setsockopt code POSIX compliant: That is * * o Truncate getsockopt returns * o Return an optlen of the truncated length if need be * * Changes: * David L Stevens <dlstevens@us.ibm.com>: * - added multicast source filtering API for MLDv2 */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/mroute6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/netfilter.h> #include <linux/slab.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/inet_common.h> #include <net/tcp.h> #include <net/udp.h> #include <net/udplite.h> #include <net/xfrm.h> #include <net/compat.h> #include <net/seg6.h> #include <linux/uaccess.h> struct ip6_ra_chain *ip6_ra_chain; DEFINE_RWLOCK(ip6_ra_lock); DEFINE_STATIC_KEY_FALSE(ip6_min_hopcount); int ip6_ra_control(struct sock *sk, int sel) { struct ip6_ra_chain *ra, *new_ra, **rap; /* RA packet may be delivered ONLY to IPPROTO_RAW socket */ if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW) return -ENOPROTOOPT; new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; if (sel >= 0 && !new_ra) return -ENOMEM; write_lock_bh(&ip6_ra_lock); for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (sel >= 0) { write_unlock_bh(&ip6_ra_lock); kfree(new_ra); return -EADDRINUSE; } *rap = ra->next; write_unlock_bh(&ip6_ra_lock); sock_put(sk); kfree(ra); return 0; } } if (!new_ra) { write_unlock_bh(&ip6_ra_lock); return -ENOBUFS; } new_ra->sk = sk; new_ra->sel = sel; new_ra->next = ra; *rap = new_ra; sock_hold(sk); write_unlock_bh(&ip6_ra_lock); return 0; } struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt) { if (inet_test_bit(IS_ICSK, sk)) { if (opt && !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt, opt); sk_dst_reset(sk); return opt; } static bool setsockopt_needs_rtnl(int optname) { switch (optname) { case IPV6_ADDRFORM: case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: case IPV6_JOIN_ANYCAST: case IPV6_LEAVE_ANYCAST: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: case MCAST_MSFILTER: return true; } return false; } static int copy_group_source_from_sockptr(struct group_source_req *greqs, sockptr_t optval, int optlen) { if (in_compat_syscall()) { struct compat_group_source_req gr32; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; greqs->gsr_interface = gr32.gsr_interface; greqs->gsr_group = gr32.gsr_group; greqs->gsr_source = gr32.gsr_source; } else { if (optlen < sizeof(*greqs)) return -EINVAL; if (copy_from_sockptr(greqs, optval, sizeof(*greqs))) return -EFAULT; } return 0; } static int 
do_ipv6_mcast_group_source(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct group_source_req greqs; int omode, add; int ret; ret = copy_group_source_from_sockptr(&greqs, optval, optlen); if (ret) return ret; if (greqs.gsr_group.ss_family != AF_INET6 || greqs.gsr_source.ss_family != AF_INET6) return -EADDRNOTAVAIL; if (optname == MCAST_BLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 1; } else if (optname == MCAST_UNBLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 0; } else if (optname == MCAST_JOIN_SOURCE_GROUP) { struct sockaddr_in6 *psin6; int retv; psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface, &psin6->sin6_addr, MCAST_INCLUDE); /* prior join w/ different source is ok */ if (retv && retv != -EADDRINUSE) return retv; omode = MCAST_INCLUDE; add = 1; } else /* MCAST_LEAVE_SOURCE_GROUP */ { omode = MCAST_INCLUDE; add = 0; } return ip6_mc_source(add, omode, sk, &greqs); } static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { struct group_filter *gsf; int ret; if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); if (IS_ERR(gsf)) return PTR_ERR(gsf); /* numsrc >= (4G-140)/128 overflow in 32 bits */ ret = -ENOBUFS; if (gsf->gf_numsrc >= 0x1ffffffU || gsf->gf_numsrc > sysctl_mld_max_msf) goto out_free_gsf; ret = -EINVAL; if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) goto out_free_gsf; ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return ret; } static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; void *p; int ret; int n; if (optlen < size0) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); if (!p) return -ENOMEM; gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ ret = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) goto out_free_p; /* numsrc >= (4G-140)/128 overflow in 32 bits */ ret = -ENOBUFS; n = gf32->gf_numsrc; if (n >= 0x1ffffffU || n > sysctl_mld_max_msf) goto out_free_p; ret = -EINVAL; if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_p; ret = ip6_mc_msfilter(sk, &(struct group_filter){ .gf_interface = gf32->gf_interface, .gf_group = gf32->gf_group, .gf_fmode = gf32->gf_fmode, .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist_flex); out_free_p: kfree(p); return ret; } static int ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct sockaddr_in6 *psin6; struct group_req greq; if (optlen < sizeof(greq)) return -EINVAL; if (copy_from_sockptr(&greq, optval, sizeof(greq))) return -EFAULT; if (greq.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&greq.gr_group; if (optname == MCAST_JOIN_GROUP) return ipv6_sock_mc_join(sk, greq.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, greq.gr_interface, &psin6->sin6_addr); } static int compat_ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct compat_group_req gr32; struct sockaddr_in6 *psin6; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; if (gr32.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&gr32.gr_group; 
if (optname == MCAST_JOIN_GROUP) return ipv6_sock_mc_join(sk, gr32.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, gr32.gr_interface, &psin6->sin6_addr); } static int ipv6_set_opt_hdr(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_opt_hdr *new = NULL; struct net *net = sock_net(sk); struct ipv6_txoptions *opt; int err; /* hop-by-hop / destination options are privileged option */ if (optname != IPV6_RTHDR && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; /* remove any sticky options header with a zero option * length, per RFC3542. */ if (optlen > 0) { if (sockptr_is_null(optval)) return -EINVAL; if (optlen < sizeof(struct ipv6_opt_hdr) || optlen & 0x7 || optlen > 8 * 255) return -EINVAL; new = memdup_sockptr(optval, optlen); if (IS_ERR(new)) return PTR_ERR(new); if (unlikely(ipv6_optlen(new) > optlen)) { kfree(new); return -EINVAL; } } opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); opt = ipv6_renew_options(sk, opt, optname, new); kfree(new); if (IS_ERR(opt)) return PTR_ERR(opt); /* routing header option needs extra check */ err = -EINVAL; if (optname == IPV6_RTHDR && opt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = opt->srcrt; switch (rthdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) goto sticky_done; break; #endif case IPV6_SRCRT_TYPE_4: { struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt; if (!seg6_validate_srh(srh, optlen, false)) goto sticky_done; break; } default: goto sticky_done; } } err = 0; opt = ipv6_update_options(sk, opt); sticky_done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } return err; } int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int val, valbool; int retv = -ENOPROTOOPT; bool needs_rtnl = setsockopt_needs_rtnl(optname); if (sockptr_is_null(optval)) val = 0; else { if (optlen >= sizeof(int)) { if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; } else val = 0; } valbool = (val != 0); if (ip6_mroute_opt(optname)) return ip6_mroute_setsockopt(sk, optname, optval, optlen); /* Handle options that can be set without locking the socket. */ switch (optname) { case IPV6_UNICAST_HOPS: if (optlen < sizeof(int)) return -EINVAL; if (val > 255 || val < -1) return -EINVAL; WRITE_ONCE(np->hop_limit, val); return 0; case IPV6_MULTICAST_LOOP: if (optlen < sizeof(int)) return -EINVAL; if (val != valbool) return -EINVAL; inet6_assign_bit(MC6_LOOP, sk, valbool); return 0; case IPV6_MULTICAST_HOPS: if (sk->sk_type == SOCK_STREAM) return retv; if (optlen < sizeof(int)) return -EINVAL; if (val > 255 || val < -1) return -EINVAL; WRITE_ONCE(np->mcast_hops, val == -1 ? IPV6_DEFAULT_MCASTHOPS : val); return 0; case IPV6_MTU: if (optlen < sizeof(int)) return -EINVAL; if (val && val < IPV6_MIN_MTU) return -EINVAL; WRITE_ONCE(np->frag_size, val); return 0; case IPV6_MINHOPCOUNT: if (optlen < sizeof(int)) return -EINVAL; if (val < 0 || val > 255) return -EINVAL; if (val) static_branch_enable(&ip6_min_hopcount); /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount * while we are changing it. 
*/ WRITE_ONCE(np->min_hopcount, val); return 0; case IPV6_RECVERR_RFC4884: if (optlen < sizeof(int)) return -EINVAL; if (val < 0 || val > 1) return -EINVAL; inet6_assign_bit(RECVERR6_RFC4884, sk, valbool); return 0; case IPV6_MULTICAST_ALL: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(MC6_ALL, sk, valbool); return 0; case IPV6_AUTOFLOWLABEL: inet6_assign_bit(AUTOFLOWLABEL, sk, valbool); inet6_set_bit(AUTOFLOWLABEL_SET, sk); return 0; case IPV6_DONTFRAG: inet6_assign_bit(DONTFRAG, sk, valbool); return 0; case IPV6_RECVERR: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(RECVERR6, sk, valbool); if (!val) skb_errqueue_purge(&sk->sk_error_queue); return 0; case IPV6_ROUTER_ALERT_ISOLATE: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(RTALERT_ISOLATE, sk, valbool); return 0; case IPV6_MTU_DISCOVER: if (optlen < sizeof(int)) return -EINVAL; if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT) return -EINVAL; WRITE_ONCE(np->pmtudisc, val); return 0; case IPV6_FLOWINFO_SEND: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(SNDFLOW, sk, valbool); return 0; case IPV6_ADDR_PREFERENCES: if (optlen < sizeof(int)) return -EINVAL; return ip6_sock_set_addr_preferences(sk, val); case IPV6_MULTICAST_IF: if (sk->sk_type == SOCK_STREAM) return -ENOPROTOOPT; if (optlen < sizeof(int)) return -EINVAL; if (val) { struct net_device *dev; int bound_dev_if, midx; rcu_read_lock(); dev = dev_get_by_index_rcu(net, val); if (!dev) { rcu_read_unlock(); return -ENODEV; } midx = l3mdev_master_ifindex_rcu(dev); rcu_read_unlock(); bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && bound_dev_if != val && (!midx || midx != bound_dev_if)) return -EINVAL; } WRITE_ONCE(np->mcast_oif, val); return 0; case IPV6_UNICAST_IF: { struct net_device *dev; int ifindex; if (optlen != sizeof(int)) return -EINVAL; ifindex = (__force int)ntohl((__force __be32)val); if (!ifindex) { WRITE_ONCE(np->ucast_oif, 0); return 0; } dev = dev_get_by_index(net, ifindex); if (!dev) return -EADDRNOTAVAIL; dev_put(dev); if (READ_ONCE(sk->sk_bound_dev_if)) return -EINVAL; WRITE_ONCE(np->ucast_oif, ifindex); return 0; } } if (needs_rtnl) rtnl_lock(); sockopt_lock_sock(sk); /* Another thread has converted the socket into IPv4 with * IPV6_ADDRFORM concurrently. 
*/ if (unlikely(sk->sk_family != AF_INET6)) goto unlock; switch (optname) { case IPV6_ADDRFORM: if (optlen < sizeof(int)) goto e_inval; if (val == PF_INET) { if (sk->sk_type == SOCK_RAW) break; if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE) { struct udp_sock *up = udp_sk(sk); if (up->pending == AF_INET6) { retv = -EBUSY; break; } } else if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_prot != &tcpv6_prot) { retv = -EBUSY; break; } } else { break; } if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; break; } if (ipv6_only_sock(sk) || !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { retv = -EADDRNOTAVAIL; break; } __ipv6_sock_mc_close(sk); __ipv6_sock_ac_close(sk); if (sk->sk_protocol == IPPROTO_TCP) { struct inet_connection_sock *icsk = inet_csk(sk); sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, &tcp_prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_stream_ops */ WRITE_ONCE(sk->sk_prot, &tcp_prot); /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific); WRITE_ONCE(sk->sk_socket->ops, &inet_stream_ops); WRITE_ONCE(sk->sk_family, PF_INET); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct proto *prot = &udp_prot; if (sk->sk_protocol == IPPROTO_UDPLITE) prot = &udplite_prot; sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */ WRITE_ONCE(sk->sk_prot, prot); WRITE_ONCE(sk->sk_socket->ops, &inet_dgram_ops); WRITE_ONCE(sk->sk_family, PF_INET); } /* Disable all options not to allocate memory anymore, * but there is still a race. See the lockless path * in udpv6_sendmsg() and ipv6_local_rxpmtu(). */ np->rxopt.all = 0; inet6_cleanup_sock(sk); module_put(THIS_MODULE); retv = 0; break; } goto e_inval; case IPV6_V6ONLY: if (optlen < sizeof(int) || inet_sk(sk)->inet_num) goto e_inval; sk->sk_ipv6only = valbool; retv = 0; break; case IPV6_RECVPKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxinfo = valbool; retv = 0; break; case IPV6_2292PKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxoinfo = valbool; retv = 0; break; case IPV6_RECVHOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxhlim = valbool; retv = 0; break; case IPV6_2292HOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxohlim = valbool; retv = 0; break; case IPV6_RECVRTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.srcrt = valbool; retv = 0; break; case IPV6_2292RTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.osrcrt = valbool; retv = 0; break; case IPV6_RECVHOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.hopopts = valbool; retv = 0; break; case IPV6_2292HOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.ohopopts = valbool; retv = 0; break; case IPV6_RECVDSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.dstopts = valbool; retv = 0; break; case IPV6_2292DSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.odstopts = valbool; retv = 0; break; case IPV6_TCLASS: if (optlen < sizeof(int)) goto e_inval; if (val < -1 || val > 0xff) goto e_inval; /* RFC 3542, 6.5: default traffic class of 0x0 */ if (val == -1) val = 0; if (sk->sk_type == SOCK_STREAM) { val &= ~INET_ECN_MASK; val |= np->tclass & INET_ECN_MASK; } if (np->tclass != val) { np->tclass = val; sk_dst_reset(sk); } retv = 0; break; case IPV6_RECVTCLASS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxtclass = valbool; retv = 0; break; case IPV6_FLOWINFO: 
if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxflow = valbool; retv = 0; break; case IPV6_RECVPATHMTU: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxpmtu = valbool; retv = 0; break; case IPV6_TRANSPARENT: if (valbool && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) { retv = -EPERM; break; } if (optlen < sizeof(int)) goto e_inval; /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ inet_assign_bit(TRANSPARENT, sk, valbool); retv = 0; break; case IPV6_FREEBIND: if (optlen < sizeof(int)) goto e_inval; /* we also don't have a separate freebind bit for IPV6 */ inet_assign_bit(FREEBIND, sk, valbool); retv = 0; break; case IPV6_RECVORIGDSTADDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxorigdstaddr = valbool; retv = 0; break; case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: retv = ipv6_set_opt_hdr(sk, optname, optval, optlen); break; case IPV6_PKTINFO: { struct in6_pktinfo pkt; if (optlen == 0) goto e_inval; else if (optlen < sizeof(struct in6_pktinfo) || sockptr_is_null(optval)) goto e_inval; if (copy_from_sockptr(&pkt, optval, sizeof(pkt))) { retv = -EFAULT; break; } if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; np->sticky_pktinfo.ipi6_addr = pkt.ipi6_addr; retv = 0; break; } case IPV6_2292PKTOPTIONS: { struct ipv6_txoptions *opt = NULL; struct msghdr msg; struct flowi6 fl6; struct ipcm6_cookie ipc6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; if (optlen == 0) goto update; /* 1K is probably excessive * 1K is surely not enough, 2K per standard header is 16K. */ retv = -EINVAL; if (optlen > 64*1024) break; opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL); retv = -ENOBUFS; if (!opt) break; memset(opt, 0, sizeof(*opt)); refcount_set(&opt->refcnt, 1); opt->tot_len = sizeof(*opt) + optlen; retv = -EFAULT; if (copy_from_sockptr(opt + 1, optval, optlen)) goto done; msg.msg_controllen = optlen; msg.msg_control_is_user = false; msg.msg_control = (void *)(opt+1); ipc6.opt = opt; retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6); if (retv) goto done; update: retv = 0; opt = ipv6_update_options(sk, opt); done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } break; } case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: { struct ipv6_mreq mreq; if (optlen < sizeof(struct ipv6_mreq)) goto e_inval; retv = -EPROTO; if (inet_test_bit(IS_ICSK, sk)) break; retv = -EFAULT; if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_ADD_MEMBERSHIP) retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); else retv = ipv6_sock_mc_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); break; } case IPV6_JOIN_ANYCAST: case IPV6_LEAVE_ANYCAST: { struct ipv6_mreq mreq; if (optlen < sizeof(struct ipv6_mreq)) goto e_inval; retv = -EFAULT; if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_JOIN_ANYCAST) retv = ipv6_sock_ac_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); else retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); break; } case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: if (in_compat_syscall()) retv = compat_ipv6_mcast_join_leave(sk, optname, optval, optlen); else retv = ipv6_mcast_join_leave(sk, optname, optval, optlen); break; case MCAST_JOIN_SOURCE_GROUP: case 
MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: retv = do_ipv6_mcast_group_source(sk, optname, optval, optlen); break; case MCAST_MSFILTER: if (in_compat_syscall()) retv = compat_ipv6_set_mcast_msfilter(sk, optval, optlen); else retv = ipv6_set_mcast_msfilter(sk, optval, optlen); break; case IPV6_ROUTER_ALERT: if (optlen < sizeof(int)) goto e_inval; retv = ip6_ra_control(sk, val); break; case IPV6_FLOWLABEL_MGR: retv = ipv6_flowlabel_opt(sk, optval, optlen); break; case IPV6_IPSEC_POLICY: case IPV6_XFRM_POLICY: retv = -EPERM; if (!sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) break; retv = xfrm_user_policy(sk, optname, optval, optlen); break; case IPV6_RECVFRAGSIZE: np->rxopt.bits.recvfragsize = valbool; retv = 0; break; } unlock: sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return retv; e_inval: retv = -EINVAL; goto unlock; } int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) return udp_prot.setsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY && optname != IPV6_XFRM_POLICY) err = nf_setsockopt(sk, PF_INET6, optname, optval, optlen); #endif return err; } EXPORT_SYMBOL(ipv6_setsockopt); static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, int optname, sockptr_t optval, int len) { struct ipv6_opt_hdr *hdr; if (!opt) return 0; switch (optname) { case IPV6_HOPOPTS: hdr = opt->hopopt; break; case IPV6_RTHDRDSTOPTS: hdr = opt->dst0opt; break; case IPV6_RTHDR: hdr = (struct ipv6_opt_hdr *)opt->srcrt; break; case IPV6_DSTOPTS: hdr = opt->dst1opt; break; default: return -EINVAL; /* should not happen */ } if (!hdr) return 0; len = min_t(unsigned int, len, ipv6_optlen(hdr)); if (copy_to_sockptr(optval, hdr, len)) return -EFAULT; return len; } static int ipv6_get_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct group_filter, gf_slist_flex); struct group_filter gsf; int num; int err; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gsf, optval, size0)) return -EFAULT; if (gsf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; num = gsf.gf_numsrc; sockopt_lock_sock(sk); err = ip6_mc_msfget(sk, &gsf, optval, size0); if (!err) { if (num > gsf.gf_numsrc) num = gsf.gf_numsrc; len = GROUP_FILTER_SIZE(num); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr(optval, &gsf, size0)) err = -EFAULT; } sockopt_release_sock(sk); return err; } static int compat_ipv6_get_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter gf32; struct group_filter gf; int err; int num; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gf32, optval, size0)) return -EFAULT; gf.gf_interface = gf32.gf_interface; gf.gf_fmode = gf32.gf_fmode; num = gf.gf_numsrc = gf32.gf_numsrc; gf.gf_group = gf32.gf_group; if (gf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; sockopt_lock_sock(sk); err = ip6_mc_msfget(sk, &gf, optval, size0); sockopt_release_sock(sk); if (err) return err; if (num > gf.gf_numsrc) num = gf.gf_numsrc; len = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32)); if (copy_to_sockptr(optlen, 
&len, sizeof(int)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode), &gf.gf_fmode, sizeof(gf32.gf_fmode)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc), &gf.gf_numsrc, sizeof(gf32.gf_numsrc))) return -EFAULT; return 0; } int do_ipv6_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct ipv6_pinfo *np = inet6_sk(sk); int len; int val; if (ip6_mroute_opt(optname)) return ip6_mroute_getsockopt(sk, optname, optval, optlen); if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; switch (optname) { case IPV6_ADDRFORM: if (sk->sk_protocol != IPPROTO_UDP && sk->sk_protocol != IPPROTO_UDPLITE && sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; val = sk->sk_family; break; case MCAST_MSFILTER: if (in_compat_syscall()) return compat_ipv6_get_msfilter(sk, optval, optlen, len); return ipv6_get_msfilter(sk, optval, optlen, len); case IPV6_2292PKTOPTIONS: { struct msghdr msg; struct sk_buff *skb; if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; if (optval.is_kernel) { msg.msg_control_is_user = false; msg.msg_control = optval.kernel; } else { msg.msg_control_is_user = true; msg.msg_control_user = optval.user; } msg.msg_controllen = len; msg.msg_flags = 0; sockopt_lock_sock(sk); skb = np->pktoptions; if (skb) ip6_datagram_recv_ctl(sk, &msg, skb); sockopt_release_sock(sk); if (!skb) { if (np->rxopt.bits.rxinfo) { int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxhlim) { int hlim = READ_ONCE(np->mcast_hops); put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxtclass) { int tclass = (int)ip6_tclass(np->rcv_flowinfo); put_cmsg(&msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } if (np->rxopt.bits.rxoinfo) { int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; src_info.ipi6_addr = mcast_oif ? 
sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { int hlim = READ_ONCE(np->mcast_hops); put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxflow) { __be32 flowinfo = np->rcv_flowinfo; put_cmsg(&msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } } len -= msg.msg_controllen; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_MTU: { struct dst_entry *dst; val = 0; rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) val = dst_mtu(dst); rcu_read_unlock(); if (!val) return -ENOTCONN; break; } case IPV6_V6ONLY: val = sk->sk_ipv6only; break; case IPV6_RECVPKTINFO: val = np->rxopt.bits.rxinfo; break; case IPV6_2292PKTINFO: val = np->rxopt.bits.rxoinfo; break; case IPV6_RECVHOPLIMIT: val = np->rxopt.bits.rxhlim; break; case IPV6_2292HOPLIMIT: val = np->rxopt.bits.rxohlim; break; case IPV6_RECVRTHDR: val = np->rxopt.bits.srcrt; break; case IPV6_2292RTHDR: val = np->rxopt.bits.osrcrt; break; case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: { struct ipv6_txoptions *opt; sockopt_lock_sock(sk); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len); sockopt_release_sock(sk); /* check if ipv6_getsockopt_sticky() returns err code */ if (len < 0) return len; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_RECVHOPOPTS: val = np->rxopt.bits.hopopts; break; case IPV6_2292HOPOPTS: val = np->rxopt.bits.ohopopts; break; case IPV6_RECVDSTOPTS: val = np->rxopt.bits.dstopts; break; case IPV6_2292DSTOPTS: val = np->rxopt.bits.odstopts; break; case IPV6_TCLASS: val = np->tclass; break; case IPV6_RECVTCLASS: val = np->rxopt.bits.rxtclass; break; case IPV6_FLOWINFO: val = np->rxopt.bits.rxflow; break; case IPV6_RECVPATHMTU: val = np->rxopt.bits.rxpmtu; break; case IPV6_PATHMTU: { struct dst_entry *dst; struct ip6_mtuinfo mtuinfo; if (len < sizeof(mtuinfo)) return -EINVAL; len = sizeof(mtuinfo); memset(&mtuinfo, 0, sizeof(mtuinfo)); rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) mtuinfo.ip6m_mtu = dst_mtu(dst); rcu_read_unlock(); if (!mtuinfo.ip6m_mtu) return -ENOTCONN; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &mtuinfo, len)) return -EFAULT; return 0; } case IPV6_TRANSPARENT: val = inet_test_bit(TRANSPARENT, sk); break; case IPV6_FREEBIND: val = inet_test_bit(FREEBIND, sk); break; case IPV6_RECVORIGDSTADDR: val = np->rxopt.bits.rxorigdstaddr; break; case IPV6_UNICAST_HOPS: case IPV6_MULTICAST_HOPS: { struct dst_entry *dst; if (optname == IPV6_UNICAST_HOPS) val = READ_ONCE(np->hop_limit); else val = READ_ONCE(np->mcast_hops); if (val < 0) { rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) val = ip6_dst_hoplimit(dst); rcu_read_unlock(); } if (val < 0) val = sock_net(sk)->ipv6.devconf_all->hop_limit; break; } case IPV6_MULTICAST_LOOP: val = inet6_test_bit(MC6_LOOP, sk); break; case IPV6_MULTICAST_IF: val = READ_ONCE(np->mcast_oif); break; case IPV6_MULTICAST_ALL: val = inet6_test_bit(MC6_ALL, sk); break; case IPV6_UNICAST_IF: val = (__force int)htonl((__u32) READ_ONCE(np->ucast_oif)); break; case IPV6_MTU_DISCOVER: val = READ_ONCE(np->pmtudisc); break; case IPV6_RECVERR: val = inet6_test_bit(RECVERR6, sk); break; case IPV6_FLOWINFO_SEND: val = inet6_test_bit(SNDFLOW, sk); break; case IPV6_FLOWLABEL_MGR: { struct in6_flowlabel_req freq; int flags; if (len < sizeof(freq)) return -EINVAL; if 
(copy_from_sockptr(&freq, optval, sizeof(freq))) return -EFAULT; if (freq.flr_action != IPV6_FL_A_GET) return -EINVAL; len = sizeof(freq); flags = freq.flr_flags; memset(&freq, 0, sizeof(freq)); val = ipv6_flowlabel_opt_get(sk, &freq, flags); if (val < 0) return val; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &freq, len)) return -EFAULT; return 0; } case IPV6_ADDR_PREFERENCES: { u8 srcprefs = READ_ONCE(np->srcprefs); val = 0; if (srcprefs & IPV6_PREFER_SRC_TMP) val |= IPV6_PREFER_SRC_TMP; else if (srcprefs & IPV6_PREFER_SRC_PUBLIC) val |= IPV6_PREFER_SRC_PUBLIC; else { /* XXX: should we return system default? */ val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT; } if (srcprefs & IPV6_PREFER_SRC_COA) val |= IPV6_PREFER_SRC_COA; else val |= IPV6_PREFER_SRC_HOME; break; } case IPV6_MINHOPCOUNT: val = READ_ONCE(np->min_hopcount); break; case IPV6_DONTFRAG: val = inet6_test_bit(DONTFRAG, sk); break; case IPV6_AUTOFLOWLABEL: val = ip6_autoflowlabel(sock_net(sk), sk); break; case IPV6_RECVFRAGSIZE: val = np->rxopt.bits.recvfragsize; break; case IPV6_ROUTER_ALERT_ISOLATE: val = inet6_test_bit(RTALERT_ISOLATE, sk); break; case IPV6_RECVERR_RFC4884: val = inet6_test_bit(RECVERR6_RFC4884, sk); break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, sizeof(int), len); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) return udp_prot.getsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; err = do_ipv6_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { int len; if (get_user(len, optlen)) return -EFAULT; err = nf_getsockopt(sk, PF_INET6, optname, optval, &len); if (err >= 0) err = put_user(len, optlen); } #endif return err; } EXPORT_SYMBOL(ipv6_getsockopt);
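/*
 * Example (userspace, illustration only): exercising a few of the options
 * handled by do_ipv6_setsockopt()/do_ipv6_getsockopt() above. The values are
 * arbitrary and error handling is minimal.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
	int hops = 32, on = 1, tclass = 0;
	socklen_t len = sizeof(tclass);

	if (fd < 0)
		return 1;

	/* Lockless fast path above: bounds-checked, then stored in np->hop_limit. */
	if (setsockopt(fd, IPPROTO_IPV6, IPV6_UNICAST_HOPS, &hops, sizeof(hops)))
		perror("IPV6_UNICAST_HOPS");

	/* Handled under the socket lock: flips np->rxopt.bits.rxinfo. */
	if (setsockopt(fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, &on, sizeof(on)))
		perror("IPV6_RECVPKTINFO");

	/* Read back through do_ipv6_getsockopt(); len is clamped to sizeof(int). */
	if (getsockopt(fd, IPPROTO_IPV6, IPV6_TCLASS, &tclass, &len) == 0)
		printf("IPV6_TCLASS = %d (len %u)\n", tclass, (unsigned int)len);

	close(fd);
	return 0;
}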
// SPDX-License-Identifier: GPL-2.0 /* * f2fs compress support * * Copyright (c) 2019 Chao Yu <chao@kernel.org> */ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/moduleparam.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/lzo.h> #include <linux/lz4.h> #include <linux/zstd.h> #include <linux/pagevec.h> #include "f2fs.h" #include "node.h" #include "segment.h" #include <trace/events/f2fs.h> static struct kmem_cache *cic_entry_slab; static struct kmem_cache *dic_entry_slab; static void *page_array_alloc(struct inode *inode, int nr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int size = sizeof(struct page *) * nr; if (likely(size <= sbi->page_array_slab_size)) return f2fs_kmem_cache_alloc(sbi->page_array_slab, GFP_F2FS_ZERO, false, F2FS_I_SB(inode)); return f2fs_kzalloc(sbi, size, GFP_NOFS); } static void page_array_free(struct inode *inode, void *pages, int nr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int size = sizeof(struct page *) * nr; if (!pages) return; if (likely(size <= sbi->page_array_slab_size)) kmem_cache_free(sbi->page_array_slab, pages); else kfree(pages); } struct f2fs_compress_ops { int (*init_compress_ctx)(struct compress_ctx *cc); void (*destroy_compress_ctx)(struct compress_ctx *cc); int (*compress_pages)(struct compress_ctx *cc); int (*init_decompress_ctx)(struct decompress_io_ctx *dic); void (*destroy_decompress_ctx)(struct decompress_io_ctx *dic); int (*decompress_pages)(struct decompress_io_ctx *dic); bool (*is_level_valid)(int level); }; static unsigned int offset_in_cluster(struct
compress_ctx *cc, pgoff_t index) { return index & (cc->cluster_size - 1); } static pgoff_t cluster_idx(struct compress_ctx *cc, pgoff_t index) { return index >> cc->log_cluster_size; } static pgoff_t start_idx_of_cluster(struct compress_ctx *cc) { return cc->cluster_idx << cc->log_cluster_size; } bool f2fs_is_compressed_page(struct page *page) { if (!PagePrivate(page)) return false; if (!page_private(page)) return false; if (page_private_nonpointer(page)) return false; f2fs_bug_on(F2FS_M_SB(page->mapping), *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC); return true; } static void f2fs_set_compressed_page(struct page *page, struct inode *inode, pgoff_t index, void *data) { attach_page_private(page, (void *)data); /* i_crypto_info and iv index */ page->index = index; page->mapping = inode->i_mapping; } static void f2fs_drop_rpages(struct compress_ctx *cc, int len, bool unlock) { int i; for (i = 0; i < len; i++) { if (!cc->rpages[i]) continue; if (unlock) unlock_page(cc->rpages[i]); else put_page(cc->rpages[i]); } } static void f2fs_put_rpages(struct compress_ctx *cc) { f2fs_drop_rpages(cc, cc->cluster_size, false); } static void f2fs_unlock_rpages(struct compress_ctx *cc, int len) { f2fs_drop_rpages(cc, len, true); } static void f2fs_put_rpages_wbc(struct compress_ctx *cc, struct writeback_control *wbc, bool redirty, int unlock) { unsigned int i; for (i = 0; i < cc->cluster_size; i++) { if (!cc->rpages[i]) continue; if (redirty) redirty_page_for_writepage(wbc, cc->rpages[i]); f2fs_put_page(cc->rpages[i], unlock); } } struct page *f2fs_compress_control_page(struct page *page) { return ((struct compress_io_ctx *)page_private(page))->rpages[0]; } int f2fs_init_compress_ctx(struct compress_ctx *cc) { if (cc->rpages) return 0; cc->rpages = page_array_alloc(cc->inode, cc->cluster_size); return cc->rpages ? 
0 : -ENOMEM; } void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) { page_array_free(cc->inode, cc->rpages, cc->cluster_size); cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; cc->valid_nr_cpages = 0; if (!reuse) cc->cluster_idx = NULL_CLUSTER; } void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page) { unsigned int cluster_ofs; if (!f2fs_cluster_can_merge_page(cc, page->index)) f2fs_bug_on(F2FS_I_SB(cc->inode), 1); cluster_ofs = offset_in_cluster(cc, page->index); cc->rpages[cluster_ofs] = page; cc->nr_rpages++; cc->cluster_idx = cluster_idx(cc, page->index); } #ifdef CONFIG_F2FS_FS_LZO static int lzo_init_compress_ctx(struct compress_ctx *cc) { cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), LZO1X_MEM_COMPRESS, GFP_NOFS); if (!cc->private) return -ENOMEM; cc->clen = lzo1x_worst_compress(PAGE_SIZE << cc->log_cluster_size); return 0; } static void lzo_destroy_compress_ctx(struct compress_ctx *cc) { kvfree(cc->private); cc->private = NULL; } static int lzo_compress_pages(struct compress_ctx *cc) { int ret; ret = lzo1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata, &cc->clen, cc->private); if (ret != LZO_E_OK) { printk_ratelimited("%sF2FS-fs (%s): lzo compress failed, ret:%d\n", KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret); return -EIO; } return 0; } static int lzo_decompress_pages(struct decompress_io_ctx *dic) { int ret; ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen, dic->rbuf, &dic->rlen); if (ret != LZO_E_OK) { printk_ratelimited("%sF2FS-fs (%s): lzo decompress failed, ret:%d\n", KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret); return -EIO; } if (dic->rlen != PAGE_SIZE << dic->log_cluster_size) { printk_ratelimited("%sF2FS-fs (%s): lzo invalid rlen:%zu, " "expected:%lu\n", KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, dic->rlen, PAGE_SIZE << dic->log_cluster_size); return -EIO; } return 0; } static const struct f2fs_compress_ops f2fs_lzo_ops = { .init_compress_ctx = lzo_init_compress_ctx, .destroy_compress_ctx = lzo_destroy_compress_ctx, .compress_pages = lzo_compress_pages, .decompress_pages = lzo_decompress_pages, }; #endif #ifdef CONFIG_F2FS_FS_LZ4 static int lz4_init_compress_ctx(struct compress_ctx *cc) { unsigned int size = LZ4_MEM_COMPRESS; #ifdef CONFIG_F2FS_FS_LZ4HC if (F2FS_I(cc->inode)->i_compress_level) size = LZ4HC_MEM_COMPRESS; #endif cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), size, GFP_NOFS); if (!cc->private) return -ENOMEM; /* * we do not change cc->clen to LZ4_compressBound(inputsize) to * adapt worst compress case, because lz4 compressor can handle * output budget properly. 
*/ cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE; return 0; } static void lz4_destroy_compress_ctx(struct compress_ctx *cc) { kvfree(cc->private); cc->private = NULL; } static int lz4_compress_pages(struct compress_ctx *cc) { int len = -EINVAL; unsigned char level = F2FS_I(cc->inode)->i_compress_level; if (!level) len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, cc->clen, cc->private); #ifdef CONFIG_F2FS_FS_LZ4HC else len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen, cc->clen, level, cc->private); #endif if (len < 0) return len; if (!len) return -EAGAIN; cc->clen = len; return 0; } static int lz4_decompress_pages(struct decompress_io_ctx *dic) { int ret; ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf, dic->clen, dic->rlen); if (ret < 0) { printk_ratelimited("%sF2FS-fs (%s): lz4 decompress failed, ret:%d\n", KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret); return -EIO; } if (ret != PAGE_SIZE << dic->log_cluster_size) { printk_ratelimited("%sF2FS-fs (%s): lz4 invalid ret:%d, " "expected:%lu\n", KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret, PAGE_SIZE << dic->log_cluster_size); return -EIO; } return 0; } static bool lz4_is_level_valid(int lvl) { #ifdef CONFIG_F2FS_FS_LZ4HC return !lvl || (lvl >= LZ4HC_MIN_CLEVEL && lvl <= LZ4HC_MAX_CLEVEL); #else return lvl == 0; #endif } static const struct f2fs_compress_ops f2fs_lz4_ops = { .init_compress_ctx = lz4_init_compress_ctx, .destroy_compress_ctx = lz4_destroy_compress_ctx, .compress_pages = lz4_compress_pages, .decompress_pages = lz4_decompress_pages, .is_level_valid = lz4_is_level_valid, }; #endif #ifdef CONFIG_F2FS_FS_ZSTD static int zstd_init_compress_ctx(struct compress_ctx *cc) { zstd_parameters params; zstd_cstream *stream; void *workspace; unsigned int workspace_size; unsigned char level = F2FS_I(cc->inode)->i_compress_level; /* Need to remain this for backward compatibility */ if (!level) level = F2FS_ZSTD_DEFAULT_CLEVEL; params = zstd_get_params(level, cc->rlen); workspace_size = zstd_cstream_workspace_bound(&params.cParams); workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), workspace_size, GFP_NOFS); if (!workspace) return -ENOMEM; stream = zstd_init_cstream(&params, 0, workspace, workspace_size); if (!stream) { printk_ratelimited("%sF2FS-fs (%s): %s zstd_init_cstream failed\n", KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, __func__); kvfree(workspace); return -EIO; } cc->private = workspace; cc->private2 = stream; cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE; return 0; } static void zstd_destroy_compress_ctx(struct compress_ctx *cc) { kvfree(cc->private); cc->private = NULL; cc->private2 = NULL; } static int zstd_compress_pages(struct compress_ctx *cc) { zstd_cstream *stream = cc->private2; zstd_in_buffer inbuf; zstd_out_buffer outbuf; int src_size = cc->rlen; int dst_size = src_size - PAGE_SIZE - COMPRESS_HEADER_SIZE; int ret; inbuf.pos = 0; inbuf.src = cc->rbuf; inbuf.size = src_size; outbuf.pos = 0; outbuf.dst = cc->cbuf->cdata; outbuf.size = dst_size; ret = zstd_compress_stream(stream, &outbuf, &inbuf); if (zstd_is_error(ret)) { printk_ratelimited("%sF2FS-fs (%s): %s zstd_compress_stream failed, ret: %d\n", KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, __func__, zstd_get_error_code(ret)); return -EIO; } ret = zstd_end_stream(stream, &outbuf); if (zstd_is_error(ret)) { printk_ratelimited("%sF2FS-fs (%s): %s zstd_end_stream returned %d\n", KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, __func__, zstd_get_error_code(ret)); return -EIO; } /* * there is compressed data remained in intermediate buffer 
due to * no more space in cbuf.cdata */ if (ret) return -EAGAIN; cc->clen = outbuf.pos; return 0; } static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic) { zstd_dstream *stream; void *workspace; unsigned int workspace_size; unsigned int max_window_size = MAX_COMPRESS_WINDOW_SIZE(dic->log_cluster_size); workspace_size = zstd_dstream_workspace_bound(max_window_size); workspace = f2fs_kvmalloc(F2FS_I_SB(dic->inode), workspace_size, GFP_NOFS); if (!workspace) return -ENOMEM; stream = zstd_init_dstream(max_window_size, workspace, workspace_size); if (!stream) { printk_ratelimited("%sF2FS-fs (%s): %s zstd_init_dstream failed\n", KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, __func__); kvfree(workspace); return -EIO; } dic->private = workspace; dic->private2 = stream; return 0; } static void zstd_destroy_decompress_ctx(struct decompress_io_ctx *dic) { kvfree(dic->private); dic->private = NULL; dic->private2 = NULL; } static int zstd_decompress_pages(struct decompress_io_ctx *dic) { zstd_dstream *stream = dic->private2; zstd_in_buffer inbuf; zstd_out_buffer outbuf; int ret; inbuf.pos = 0; inbuf.src = dic->cbuf->cdata; inbuf.size = dic->clen; outbuf.pos = 0; outbuf.dst = dic->rbuf; outbuf.size = dic->rlen; ret = zstd_decompress_stream(stream, &outbuf, &inbuf); if (zstd_is_error(ret)) { printk_ratelimited("%sF2FS-fs (%s): %s zstd_decompress_stream failed, ret: %d\n", KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, __func__, zstd_get_error_code(ret)); return -EIO; } if (dic->rlen != outbuf.pos) { printk_ratelimited("%sF2FS-fs (%s): %s ZSTD invalid rlen:%zu, " "expected:%lu\n", KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, __func__, dic->rlen, PAGE_SIZE << dic->log_cluster_size); return -EIO; } return 0; } static bool zstd_is_level_valid(int lvl) { return lvl >= zstd_min_clevel() && lvl <= zstd_max_clevel(); } static const struct f2fs_compress_ops f2fs_zstd_ops = { .init_compress_ctx = zstd_init_compress_ctx, .destroy_compress_ctx = zstd_destroy_compress_ctx, .compress_pages = zstd_compress_pages, .init_decompress_ctx = zstd_init_decompress_ctx, .destroy_decompress_ctx = zstd_destroy_decompress_ctx, .decompress_pages = zstd_decompress_pages, .is_level_valid = zstd_is_level_valid, }; #endif #ifdef CONFIG_F2FS_FS_LZO #ifdef CONFIG_F2FS_FS_LZORLE static int lzorle_compress_pages(struct compress_ctx *cc) { int ret; ret = lzorle1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata, &cc->clen, cc->private); if (ret != LZO_E_OK) { printk_ratelimited("%sF2FS-fs (%s): lzo-rle compress failed, ret:%d\n", KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret); return -EIO; } return 0; } static const struct f2fs_compress_ops f2fs_lzorle_ops = { .init_compress_ctx = lzo_init_compress_ctx, .destroy_compress_ctx = lzo_destroy_compress_ctx, .compress_pages = lzorle_compress_pages, .decompress_pages = lzo_decompress_pages, }; #endif #endif static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = { #ifdef CONFIG_F2FS_FS_LZO &f2fs_lzo_ops, #else NULL, #endif #ifdef CONFIG_F2FS_FS_LZ4 &f2fs_lz4_ops, #else NULL, #endif #ifdef CONFIG_F2FS_FS_ZSTD &f2fs_zstd_ops, #else NULL, #endif #if defined(CONFIG_F2FS_FS_LZO) && defined(CONFIG_F2FS_FS_LZORLE) &f2fs_lzorle_ops, #else NULL, #endif }; bool f2fs_is_compress_backend_ready(struct inode *inode) { if (!f2fs_compressed_file(inode)) return true; return f2fs_cops[F2FS_I(inode)->i_compress_algorithm]; } bool f2fs_is_compress_level_valid(int alg, int lvl) { const struct f2fs_compress_ops *cops = f2fs_cops[alg]; if (cops->is_level_valid) return cops->is_level_valid(lvl); return lvl == 
0; } static mempool_t *compress_page_pool; static int num_compress_pages = 512; module_param(num_compress_pages, uint, 0444); MODULE_PARM_DESC(num_compress_pages, "Number of intermediate compress pages to preallocate"); int __init f2fs_init_compress_mempool(void) { compress_page_pool = mempool_create_page_pool(num_compress_pages, 0); return compress_page_pool ? 0 : -ENOMEM; } void f2fs_destroy_compress_mempool(void) { mempool_destroy(compress_page_pool); } static struct page *f2fs_compress_alloc_page(void) { struct page *page; page = mempool_alloc(compress_page_pool, GFP_NOFS); lock_page(page); return page; } static void f2fs_compress_free_page(struct page *page) { if (!page) return; detach_page_private(page); page->mapping = NULL; unlock_page(page); mempool_free(page, compress_page_pool); } #define MAX_VMAP_RETRIES 3 static void *f2fs_vmap(struct page **pages, unsigned int count) { int i; void *buf = NULL; for (i = 0; i < MAX_VMAP_RETRIES; i++) { buf = vm_map_ram(pages, count, -1); if (buf) break; vm_unmap_aliases(); } return buf; } static int f2fs_compress_pages(struct compress_ctx *cc) { struct f2fs_inode_info *fi = F2FS_I(cc->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; unsigned int max_len, new_nr_cpages; u32 chksum = 0; int i, ret; trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx, cc->cluster_size, fi->i_compress_algorithm); if (cops->init_compress_ctx) { ret = cops->init_compress_ctx(cc); if (ret) goto out; } max_len = COMPRESS_HEADER_SIZE + cc->clen; cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); cc->valid_nr_cpages = cc->nr_cpages; cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages); if (!cc->cpages) { ret = -ENOMEM; goto destroy_compress_ctx; } for (i = 0; i < cc->nr_cpages; i++) cc->cpages[i] = f2fs_compress_alloc_page(); cc->rbuf = f2fs_vmap(cc->rpages, cc->cluster_size); if (!cc->rbuf) { ret = -ENOMEM; goto out_free_cpages; } cc->cbuf = f2fs_vmap(cc->cpages, cc->nr_cpages); if (!cc->cbuf) { ret = -ENOMEM; goto out_vunmap_rbuf; } ret = cops->compress_pages(cc); if (ret) goto out_vunmap_cbuf; max_len = PAGE_SIZE * (cc->cluster_size - 1) - COMPRESS_HEADER_SIZE; if (cc->clen > max_len) { ret = -EAGAIN; goto out_vunmap_cbuf; } cc->cbuf->clen = cpu_to_le32(cc->clen); if (fi->i_compress_flag & BIT(COMPRESS_CHKSUM)) chksum = f2fs_crc32(F2FS_I_SB(cc->inode), cc->cbuf->cdata, cc->clen); cc->cbuf->chksum = cpu_to_le32(chksum); for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) cc->cbuf->reserved[i] = cpu_to_le32(0); new_nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); /* zero out any unused part of the last page */ memset(&cc->cbuf->cdata[cc->clen], 0, (new_nr_cpages * PAGE_SIZE) - (cc->clen + COMPRESS_HEADER_SIZE)); vm_unmap_ram(cc->cbuf, cc->nr_cpages); vm_unmap_ram(cc->rbuf, cc->cluster_size); for (i = new_nr_cpages; i < cc->nr_cpages; i++) { f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } if (cops->destroy_compress_ctx) cops->destroy_compress_ctx(cc); cc->valid_nr_cpages = new_nr_cpages; trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, cc->clen, ret); return 0; out_vunmap_cbuf: vm_unmap_ram(cc->cbuf, cc->nr_cpages); out_vunmap_rbuf: vm_unmap_ram(cc->rbuf, cc->cluster_size); out_free_cpages: for (i = 0; i < cc->nr_cpages; i++) { if (cc->cpages[i]) f2fs_compress_free_page(cc->cpages[i]); } page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; destroy_compress_ctx: if (cops->destroy_compress_ctx) cops->destroy_compress_ctx(cc); out: 
trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, cc->clen, ret); return ret; } static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, bool pre_alloc); static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, bool bypass_destroy_callback, bool pre_alloc); void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) { struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); struct f2fs_inode_info *fi = F2FS_I(dic->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; bool bypass_callback = false; int ret; trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, dic->cluster_size, fi->i_compress_algorithm); if (dic->failed) { ret = -EIO; goto out_end_io; } ret = f2fs_prepare_decomp_mem(dic, false); if (ret) { bypass_callback = true; goto out_release; } dic->clen = le32_to_cpu(dic->cbuf->clen); dic->rlen = PAGE_SIZE << dic->log_cluster_size; if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { ret = -EFSCORRUPTED; /* Avoid f2fs_commit_super in irq context */ if (!in_task) f2fs_handle_error_async(sbi, ERROR_FAIL_DECOMPRESSION); else f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); goto out_release; } ret = cops->decompress_pages(dic); if (!ret && (fi->i_compress_flag & BIT(COMPRESS_CHKSUM))) { u32 provided = le32_to_cpu(dic->cbuf->chksum); u32 calculated = f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen); if (provided != calculated) { if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) { set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT); printk_ratelimited( "%sF2FS-fs (%s): checksum invalid, nid = %lu, %x vs %x", KERN_INFO, sbi->sb->s_id, dic->inode->i_ino, provided, calculated); } set_sbi_flag(sbi, SBI_NEED_FSCK); } } out_release: f2fs_release_decomp_mem(dic, bypass_callback, false); out_end_io: trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, dic->clen, ret); f2fs_decompress_end_io(dic, ret, in_task); } /* * This is called when a page of a compressed cluster has been read from disk * (or failed to be read from disk). It checks whether this page was the last * page being waited on in the cluster, and if so, it decompresses the cluster * (or in the case of a failure, cleans up without actually decompressing). */ void f2fs_end_read_compressed_page(struct page *page, bool failed, block_t blkaddr, bool in_task) { struct decompress_io_ctx *dic = (struct decompress_io_ctx *)page_private(page); struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); dec_page_count(sbi, F2FS_RD_DATA); if (failed) WRITE_ONCE(dic->failed, true); else if (blkaddr && in_task) f2fs_cache_compressed_page(sbi, page, dic->inode->i_ino, blkaddr); if (atomic_dec_and_test(&dic->remaining_pages)) f2fs_decompress_cluster(dic, in_task); } static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) { if (cc->cluster_idx == NULL_CLUSTER) return true; return cc->cluster_idx == cluster_idx(cc, index); } bool f2fs_cluster_is_empty(struct compress_ctx *cc) { return cc->nr_rpages == 0; } static bool f2fs_cluster_is_full(struct compress_ctx *cc) { return cc->cluster_size == cc->nr_rpages; } bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) { if (f2fs_cluster_is_empty(cc)) return true; return is_page_in_cluster(cc, index); } bool f2fs_all_cluster_page_ready(struct compress_ctx *cc, struct page **pages, int index, int nr_pages, bool uptodate) { unsigned long pgidx = pages[index]->index; int i = uptodate ? 0 : 1; /* * when uptodate set to true, try to check all pages in cluster is * uptodate or not. 
*/ if (uptodate && (pgidx % cc->cluster_size)) return false; if (nr_pages - index < cc->cluster_size) return false; for (; i < cc->cluster_size; i++) { if (pages[index + i]->index != pgidx + i) return false; if (uptodate && !PageUptodate(pages[index + i])) return false; } return true; } static bool cluster_has_invalid_data(struct compress_ctx *cc) { loff_t i_size = i_size_read(cc->inode); unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE); int i; for (i = 0; i < cc->cluster_size; i++) { struct page *page = cc->rpages[i]; f2fs_bug_on(F2FS_I_SB(cc->inode), !page); /* beyond EOF */ if (page->index >= nr_pages) return true; } return false; } bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { #ifdef CONFIG_F2FS_CHECK_FS struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size; int cluster_end = 0; unsigned int count; int i; char *reason = ""; if (dn->data_blkaddr != COMPRESS_ADDR) return false; /* [..., COMPR_ADDR, ...] */ if (dn->ofs_in_node % cluster_size) { reason = "[*|C|*|*]"; goto out; } for (i = 1, count = 1; i < cluster_size; i++, count++) { block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node + i); /* [COMPR_ADDR, ..., COMPR_ADDR] */ if (blkaddr == COMPRESS_ADDR) { reason = "[C|*|C|*]"; goto out; } if (!__is_valid_data_blkaddr(blkaddr)) { if (!cluster_end) cluster_end = i; continue; } /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ if (cluster_end) { reason = "[C|N|N|V]"; goto out; } } f2fs_bug_on(F2FS_I_SB(dn->inode), count != cluster_size && !is_inode_flag_set(dn->inode, FI_COMPRESS_RELEASED)); return false; out: f2fs_warn(sbi, "access invalid cluster, ino:%lu, nid:%u, ofs_in_node:%u, reason:%s", dn->inode->i_ino, dn->nid, dn->ofs_in_node, reason); set_sbi_flag(sbi, SBI_NEED_FSCK); return true; #else return false; #endif } static int __f2fs_get_cluster_blocks(struct inode *inode, struct dnode_of_data *dn) { unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; int count, i; for (i = 1, count = 1; i < cluster_size; i++) { block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node + i); if (__is_valid_data_blkaddr(blkaddr)) count++; } return count; } static int __f2fs_cluster_blocks(struct inode *inode, unsigned int cluster_idx, bool compr_blks) { struct dnode_of_data dn; unsigned int start_idx = cluster_idx << F2FS_I(inode)->i_log_cluster_size; int ret; set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (ret) { if (ret == -ENOENT) ret = 0; goto fail; } if (f2fs_sanity_check_cluster(&dn)) { ret = -EFSCORRUPTED; goto fail; } if (dn.data_blkaddr == COMPRESS_ADDR) { if (compr_blks) ret = __f2fs_get_cluster_blocks(inode, &dn); else ret = 1; } fail: f2fs_put_dnode(&dn); return ret; } /* return # of compressed blocks in compressed cluster */ static int f2fs_compressed_blocks(struct compress_ctx *cc) { return __f2fs_cluster_blocks(cc->inode, cc->cluster_idx, true); } /* return whether cluster is compressed one or not */ int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) { return __f2fs_cluster_blocks(inode, index >> F2FS_I(inode)->i_log_cluster_size, false); } static bool cluster_may_compress(struct compress_ctx *cc) { if (!f2fs_need_compress_data(cc->inode)) return false; if (f2fs_is_atomic_file(cc->inode)) return false; if (!f2fs_cluster_is_full(cc)) return false; if (unlikely(f2fs_cp_error(F2FS_I_SB(cc->inode)))) return false; return !cluster_has_invalid_data(cc); } static void set_cluster_writeback(struct 
compress_ctx *cc) { int i; for (i = 0; i < cc->cluster_size; i++) { if (cc->rpages[i]) set_page_writeback(cc->rpages[i]); } } static void set_cluster_dirty(struct compress_ctx *cc) { int i; for (i = 0; i < cc->cluster_size; i++) if (cc->rpages[i]) { set_page_dirty(cc->rpages[i]); set_page_private_gcing(cc->rpages[i]); } } static int prepare_compress_overwrite(struct compress_ctx *cc, struct page **pagep, pgoff_t index, void **fsdata) { struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct address_space *mapping = cc->inode->i_mapping; struct page *page; sector_t last_block_in_bio; fgf_t fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; pgoff_t start_idx = start_idx_of_cluster(cc); int i, ret; retry: ret = f2fs_is_compressed_cluster(cc->inode, start_idx); if (ret <= 0) return ret; ret = f2fs_init_compress_ctx(cc); if (ret) return ret; /* keep page reference to avoid page reclaim */ for (i = 0; i < cc->cluster_size; i++) { page = f2fs_pagecache_get_page(mapping, start_idx + i, fgp_flag, GFP_NOFS); if (!page) { ret = -ENOMEM; goto unlock_pages; } if (PageUptodate(page)) f2fs_put_page(page, 1); else f2fs_compress_ctx_add_page(cc, page); } if (!f2fs_cluster_is_empty(cc)) { struct bio *bio = NULL; ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, &last_block_in_bio, false, true); f2fs_put_rpages(cc); f2fs_destroy_compress_ctx(cc, true); if (ret) goto out; if (bio) f2fs_submit_read_bio(sbi, bio, DATA); ret = f2fs_init_compress_ctx(cc); if (ret) goto out; } for (i = 0; i < cc->cluster_size; i++) { f2fs_bug_on(sbi, cc->rpages[i]); page = find_lock_page(mapping, start_idx + i); if (!page) { /* page can be truncated */ goto release_and_retry; } f2fs_wait_on_page_writeback(page, DATA, true, true); f2fs_compress_ctx_add_page(cc, page); if (!PageUptodate(page)) { release_and_retry: f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i + 1); f2fs_destroy_compress_ctx(cc, true); goto retry; } } if (likely(!ret)) { *fsdata = cc->rpages; *pagep = cc->rpages[offset_in_cluster(cc, index)]; return cc->cluster_size; } unlock_pages: f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i); f2fs_destroy_compress_ctx(cc, true); out: return ret; } int f2fs_prepare_compress_overwrite(struct inode *inode, struct page **pagep, pgoff_t index, void **fsdata) { struct compress_ctx cc = { .inode = inode, .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, .cluster_size = F2FS_I(inode)->i_cluster_size, .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size, .rpages = NULL, .nr_rpages = 0, }; return prepare_compress_overwrite(&cc, pagep, index, fsdata); } bool f2fs_compress_write_end(struct inode *inode, void *fsdata, pgoff_t index, unsigned copied) { struct compress_ctx cc = { .inode = inode, .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, .cluster_size = F2FS_I(inode)->i_cluster_size, .rpages = fsdata, }; bool first_index = (index == cc.rpages[0]->index); if (copied) set_cluster_dirty(&cc); f2fs_put_rpages_wbc(&cc, NULL, false, 1); f2fs_destroy_compress_ctx(&cc, false); return first_index; } int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) { void *fsdata = NULL; struct page *pagep; int log_cluster_size = F2FS_I(inode)->i_log_cluster_size; pgoff_t start_idx = from >> (PAGE_SHIFT + log_cluster_size) << log_cluster_size; int err; err = f2fs_is_compressed_cluster(inode, start_idx); if (err < 0) return err; /* truncate normal cluster */ if (!err) return f2fs_do_truncate_blocks(inode, from, lock); /* truncate compressed cluster */ err = f2fs_prepare_compress_overwrite(inode, &pagep, start_idx, &fsdata); 
/* should not be a normal cluster */ f2fs_bug_on(F2FS_I_SB(inode), err == 0); if (err <= 0) return err; if (err > 0) { struct page **rpages = fsdata; int cluster_size = F2FS_I(inode)->i_cluster_size; int i; for (i = cluster_size - 1; i >= 0; i--) { loff_t start = rpages[i]->index << PAGE_SHIFT; if (from <= start) { zero_user_segment(rpages[i], 0, PAGE_SIZE); } else { zero_user_segment(rpages[i], from - start, PAGE_SIZE); break; } } f2fs_compress_write_end(inode, fsdata, start_idx, true); } return 0; } static int f2fs_write_compressed_pages(struct compress_ctx *cc, int *submitted, struct writeback_control *wbc, enum iostat_type io_type) { struct inode *inode = cc->inode; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_io_info fio = { .sbi = sbi, .ino = cc->inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), .old_blkaddr = NEW_ADDR, .page = NULL, .encrypted_page = NULL, .compressed_page = NULL, .submitted = 0, .io_type = io_type, .io_wbc = wbc, .encrypted = fscrypt_inode_uses_fs_layer_crypto(cc->inode) ? 1 : 0, }; struct dnode_of_data dn; struct node_info ni; struct compress_io_ctx *cic; pgoff_t start_idx = start_idx_of_cluster(cc); unsigned int last_index = cc->cluster_size - 1; loff_t psize; int i, err; bool quota_inode = IS_NOQUOTA(inode); /* we should bypass data pages to proceed the kworker jobs */ if (unlikely(f2fs_cp_error(sbi))) { mapping_set_error(cc->rpages[0]->mapping, -EIO); goto out_free; } if (quota_inode) { /* * We need to wait for node_write to avoid block allocation during * checkpoint. This can only happen to quota writes which can cause * the below discard race condition. */ f2fs_down_read(&sbi->node_write); } else if (!f2fs_trylock_op(sbi)) { goto out_free; } set_new_dnode(&dn, cc->inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (err) goto out_unlock_op; for (i = 0; i < cc->cluster_size; i++) { if (data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i) == NULL_ADDR) goto out_put_dnode; } psize = (loff_t)(cc->rpages[last_index]->index + 1) << PAGE_SHIFT; err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) goto out_put_dnode; fio.version = ni.version; cic = f2fs_kmem_cache_alloc(cic_entry_slab, GFP_F2FS_ZERO, false, sbi); if (!cic) goto out_put_dnode; cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; cic->inode = inode; atomic_set(&cic->pending_pages, cc->valid_nr_cpages); cic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!cic->rpages) goto out_put_cic; cic->nr_rpages = cc->cluster_size; for (i = 0; i < cc->valid_nr_cpages; i++) { f2fs_set_compressed_page(cc->cpages[i], inode, cc->rpages[i + 1]->index, cic); fio.compressed_page = cc->cpages[i]; fio.old_blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(inode, fio.old_blkaddr); if (fio.encrypted) { fio.page = cc->rpages[i + 1]; err = f2fs_encrypt_one_page(&fio); if (err) goto out_destroy_crypt; cc->cpages[i] = fio.encrypted_page; } } set_cluster_writeback(cc); for (i = 0; i < cc->cluster_size; i++) cic->rpages[i] = cc->rpages[i]; for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) { block_t blkaddr; blkaddr = f2fs_data_blkaddr(&dn); fio.page = cc->rpages[i]; fio.old_blkaddr = blkaddr; /* cluster header */ if (i == 0) { if (blkaddr == COMPRESS_ADDR) fio.compr_blocks++; if (__is_valid_data_blkaddr(blkaddr)) f2fs_invalidate_blocks(sbi, blkaddr); f2fs_update_data_blkaddr(&dn, 
COMPRESS_ADDR); goto unlock_continue; } if (fio.compr_blocks && __is_valid_data_blkaddr(blkaddr)) fio.compr_blocks++; if (i > cc->valid_nr_cpages) { if (__is_valid_data_blkaddr(blkaddr)) { f2fs_invalidate_blocks(sbi, blkaddr); f2fs_update_data_blkaddr(&dn, NEW_ADDR); } goto unlock_continue; } f2fs_bug_on(fio.sbi, blkaddr == NULL_ADDR); if (fio.encrypted) fio.encrypted_page = cc->cpages[i - 1]; else fio.compressed_page = cc->cpages[i - 1]; cc->cpages[i - 1] = NULL; f2fs_outplace_write_data(&dn, &fio); (*submitted)++; unlock_continue: inode_dec_dirty_pages(cc->inode); unlock_page(fio.page); } if (fio.compr_blocks) f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false); f2fs_i_compr_blocks_update(inode, cc->valid_nr_cpages, true); add_compr_block_stat(inode, cc->valid_nr_cpages); set_inode_flag(cc->inode, FI_APPEND_WRITE); f2fs_put_dnode(&dn); if (quota_inode) f2fs_up_read(&sbi->node_write); else f2fs_unlock_op(sbi); spin_lock(&fi->i_size_lock); if (fi->last_disk_size < psize) fi->last_disk_size = psize; spin_unlock(&fi->i_size_lock); f2fs_put_rpages(cc); page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; f2fs_destroy_compress_ctx(cc, false); return 0; out_destroy_crypt: page_array_free(cc->inode, cic->rpages, cc->cluster_size); for (--i; i >= 0; i--) fscrypt_finalize_bounce_page(&cc->cpages[i]); out_put_cic: kmem_cache_free(cic_entry_slab, cic); out_put_dnode: f2fs_put_dnode(&dn); out_unlock_op: if (quota_inode) f2fs_up_read(&sbi->node_write); else f2fs_unlock_op(sbi); out_free: for (i = 0; i < cc->valid_nr_cpages; i++) { f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; return -EAGAIN; } void f2fs_compress_write_end_io(struct bio *bio, struct page *page) { struct f2fs_sb_info *sbi = bio->bi_private; struct compress_io_ctx *cic = (struct compress_io_ctx *)page_private(page); int i; if (unlikely(bio->bi_status)) mapping_set_error(cic->inode->i_mapping, -EIO); f2fs_compress_free_page(page); dec_page_count(sbi, F2FS_WB_DATA); if (atomic_dec_return(&cic->pending_pages)) return; for (i = 0; i < cic->nr_rpages; i++) { WARN_ON(!cic->rpages[i]); clear_page_private_gcing(cic->rpages[i]); end_page_writeback(cic->rpages[i]); } page_array_free(cic->inode, cic->rpages, cic->nr_rpages); kmem_cache_free(cic_entry_slab, cic); } static int f2fs_write_raw_pages(struct compress_ctx *cc, int *submitted, struct writeback_control *wbc, enum iostat_type io_type) { struct address_space *mapping = cc->inode->i_mapping; int _submitted, compr_blocks, ret, i; compr_blocks = f2fs_compressed_blocks(cc); for (i = 0; i < cc->cluster_size; i++) { if (!cc->rpages[i]) continue; redirty_page_for_writepage(wbc, cc->rpages[i]); unlock_page(cc->rpages[i]); } if (compr_blocks < 0) return compr_blocks; for (i = 0; i < cc->cluster_size; i++) { if (!cc->rpages[i]) continue; retry_write: lock_page(cc->rpages[i]); if (cc->rpages[i]->mapping != mapping) { continue_unlock: unlock_page(cc->rpages[i]); continue; } if (!PageDirty(cc->rpages[i])) goto continue_unlock; if (PageWriteback(cc->rpages[i])) { if (wbc->sync_mode == WB_SYNC_NONE) goto continue_unlock; f2fs_wait_on_page_writeback(cc->rpages[i], DATA, true, true); } if (!clear_page_dirty_for_io(cc->rpages[i])) goto continue_unlock; ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, NULL, NULL, wbc, io_type, compr_blocks, false); if (ret) { if (ret == AOP_WRITEPAGE_ACTIVATE) { unlock_page(cc->rpages[i]); ret = 0; } else if (ret == -EAGAIN) { /* * for quota 
file, just redirty left pages to * avoid deadlock caused by cluster update race * from foreground operation. */ if (IS_NOQUOTA(cc->inode)) return 0; ret = 0; f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry_write; } return ret; } *submitted += _submitted; } f2fs_balance_fs(F2FS_M_SB(mapping), true); return 0; } int f2fs_write_multi_pages(struct compress_ctx *cc, int *submitted, struct writeback_control *wbc, enum iostat_type io_type) { int err; *submitted = 0; if (cluster_may_compress(cc)) { err = f2fs_compress_pages(cc); if (err == -EAGAIN) { add_compr_block_stat(cc->inode, cc->cluster_size); goto write; } else if (err) { f2fs_put_rpages_wbc(cc, wbc, true, 1); goto destroy_out; } err = f2fs_write_compressed_pages(cc, submitted, wbc, io_type); if (!err) return 0; f2fs_bug_on(F2FS_I_SB(cc->inode), err != -EAGAIN); } write: f2fs_bug_on(F2FS_I_SB(cc->inode), *submitted); err = f2fs_write_raw_pages(cc, submitted, wbc, io_type); f2fs_put_rpages_wbc(cc, wbc, false, 0); destroy_out: f2fs_destroy_compress_ctx(cc, false); return err; } static inline bool allow_memalloc_for_decomp(struct f2fs_sb_info *sbi, bool pre_alloc) { return pre_alloc ^ f2fs_low_mem_mode(sbi); } static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, bool pre_alloc) { const struct f2fs_compress_ops *cops = f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; int i; if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) return 0; dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); if (!dic->tpages) return -ENOMEM; for (i = 0; i < dic->cluster_size; i++) { if (dic->rpages[i]) { dic->tpages[i] = dic->rpages[i]; continue; } dic->tpages[i] = f2fs_compress_alloc_page(); } dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); if (!dic->rbuf) return -ENOMEM; dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); if (!dic->cbuf) return -ENOMEM; if (cops->init_decompress_ctx) return cops->init_decompress_ctx(dic); return 0; } static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, bool bypass_destroy_callback, bool pre_alloc) { const struct f2fs_compress_ops *cops = f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc)) return; if (!bypass_destroy_callback && cops->destroy_decompress_ctx) cops->destroy_decompress_ctx(dic); if (dic->cbuf) vm_unmap_ram(dic->cbuf, dic->nr_cpages); if (dic->rbuf) vm_unmap_ram(dic->rbuf, dic->cluster_size); } static void f2fs_free_dic(struct decompress_io_ctx *dic, bool bypass_destroy_callback); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) { struct decompress_io_ctx *dic; pgoff_t start_idx = start_idx_of_cluster(cc); struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); int i, ret; dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, false, sbi); if (!dic) return ERR_PTR(-ENOMEM); dic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!dic->rpages) { kmem_cache_free(dic_entry_slab, dic); return ERR_PTR(-ENOMEM); } dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; dic->inode = cc->inode; atomic_set(&dic->remaining_pages, cc->nr_cpages); dic->cluster_idx = cc->cluster_idx; dic->cluster_size = cc->cluster_size; dic->log_cluster_size = cc->log_cluster_size; dic->nr_cpages = cc->nr_cpages; refcount_set(&dic->refcnt, 1); dic->failed = false; dic->need_verity = f2fs_need_verity(cc->inode, start_idx); for (i = 0; i < dic->cluster_size; i++) dic->rpages[i] = cc->rpages[i]; dic->nr_rpages = cc->cluster_size; dic->cpages = page_array_alloc(dic->inode, dic->nr_cpages); if (!dic->cpages) 
{ ret = -ENOMEM; goto out_free; } for (i = 0; i < dic->nr_cpages; i++) { struct page *page; page = f2fs_compress_alloc_page(); f2fs_set_compressed_page(page, cc->inode, start_idx + i + 1, dic); dic->cpages[i] = page; } ret = f2fs_prepare_decomp_mem(dic, true); if (ret) goto out_free; return dic; out_free: f2fs_free_dic(dic, true); return ERR_PTR(ret); } static void f2fs_free_dic(struct decompress_io_ctx *dic, bool bypass_destroy_callback) { int i; f2fs_release_decomp_mem(dic, bypass_destroy_callback, true); if (dic->tpages) { for (i = 0; i < dic->cluster_size; i++) { if (dic->rpages[i]) continue; if (!dic->tpages[i]) continue; f2fs_compress_free_page(dic->tpages[i]); } page_array_free(dic->inode, dic->tpages, dic->cluster_size); } if (dic->cpages) { for (i = 0; i < dic->nr_cpages; i++) { if (!dic->cpages[i]) continue; f2fs_compress_free_page(dic->cpages[i]); } page_array_free(dic->inode, dic->cpages, dic->nr_cpages); } page_array_free(dic->inode, dic->rpages, dic->nr_rpages); kmem_cache_free(dic_entry_slab, dic); } static void f2fs_late_free_dic(struct work_struct *work) { struct decompress_io_ctx *dic = container_of(work, struct decompress_io_ctx, free_work); f2fs_free_dic(dic, false); } static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) { if (refcount_dec_and_test(&dic->refcnt)) { if (in_task) { f2fs_free_dic(dic, false); } else { INIT_WORK(&dic->free_work, f2fs_late_free_dic); queue_work(F2FS_I_SB(dic->inode)->post_read_wq, &dic->free_work); } } } static void f2fs_verify_cluster(struct work_struct *work) { struct decompress_io_ctx *dic = container_of(work, struct decompress_io_ctx, verity_work); int i; /* Verify, update, and unlock the decompressed pages. */ for (i = 0; i < dic->cluster_size; i++) { struct page *rpage = dic->rpages[i]; if (!rpage) continue; if (fsverity_verify_page(rpage)) SetPageUptodate(rpage); else ClearPageUptodate(rpage); unlock_page(rpage); } f2fs_put_dic(dic, true); } /* * This is called when a compressed cluster has been decompressed * (or failed to be read and/or decompressed). */ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, bool in_task) { int i; if (!failed && dic->need_verity) { /* * Note that to avoid deadlocks, the verity work can't be done * on the decompression workqueue. This is because verifying * the data pages can involve reading metadata pages from the * file, and these metadata pages may be compressed. */ INIT_WORK(&dic->verity_work, f2fs_verify_cluster); fsverity_enqueue_verify_work(&dic->verity_work); return; } /* Update and unlock the cluster's pagecache pages. */ for (i = 0; i < dic->cluster_size; i++) { struct page *rpage = dic->rpages[i]; if (!rpage) continue; if (failed) ClearPageUptodate(rpage); else SetPageUptodate(rpage); unlock_page(rpage); } /* * Release the reference to the decompress_io_ctx that was being held * for I/O completion. */ f2fs_put_dic(dic, in_task); } /* * Put a reference to a compressed page's decompress_io_ctx. * * This is called when the page is no longer needed and can be freed. */ void f2fs_put_page_dic(struct page *page, bool in_task) { struct decompress_io_ctx *dic = (struct decompress_io_ctx *)page_private(page); f2fs_put_dic(dic, in_task); } /* * check whether cluster blocks are contiguous, and add extent cache entry * only if cluster blocks are logically and physically contiguous. */ unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) { bool compressed = f2fs_data_blkaddr(dn) == COMPRESS_ADDR; int i = compressed ? 
1 : 0; block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node + i); for (i += 1; i < F2FS_I(dn->inode)->i_cluster_size; i++) { block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node + i); if (!__is_valid_data_blkaddr(blkaddr)) break; if (first_blkaddr + i - (compressed ? 1 : 0) != blkaddr) return 0; } return compressed ? i - 1 : i; } const struct address_space_operations f2fs_compress_aops = { .release_folio = f2fs_release_folio, .invalidate_folio = f2fs_invalidate_folio, .migrate_folio = filemap_migrate_folio, }; struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi) { return sbi->compress_inode->i_mapping; } void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr) { if (!sbi->compress_inode) return; invalidate_mapping_pages(COMPRESS_MAPPING(sbi), blkaddr, blkaddr); } void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, nid_t ino, block_t blkaddr) { struct page *cpage; int ret; if (!test_opt(sbi, COMPRESS_CACHE)) return; if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) return; if (!f2fs_available_free_memory(sbi, COMPRESS_PAGE)) return; cpage = find_get_page(COMPRESS_MAPPING(sbi), blkaddr); if (cpage) { f2fs_put_page(cpage, 0); return; } cpage = alloc_page(__GFP_NOWARN | __GFP_IO); if (!cpage) return; ret = add_to_page_cache_lru(cpage, COMPRESS_MAPPING(sbi), blkaddr, GFP_NOFS); if (ret) { f2fs_put_page(cpage, 0); return; } set_page_private_data(cpage, ino); if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) goto out; memcpy(page_address(cpage), page_address(page), PAGE_SIZE); SetPageUptodate(cpage); out: f2fs_put_page(cpage, 1); } bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, block_t blkaddr) { struct page *cpage; bool hitted = false; if (!test_opt(sbi, COMPRESS_CACHE)) return false; cpage = f2fs_pagecache_get_page(COMPRESS_MAPPING(sbi), blkaddr, FGP_LOCK | FGP_NOWAIT, GFP_NOFS); if (cpage) { if (PageUptodate(cpage)) { atomic_inc(&sbi->compress_page_hit); memcpy(page_address(page), page_address(cpage), PAGE_SIZE); hitted = true; } f2fs_put_page(cpage, 1); } return hitted; } void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { struct address_space *mapping = COMPRESS_MAPPING(sbi); struct folio_batch fbatch; pgoff_t index = 0; pgoff_t end = MAX_BLKADDR(sbi); if (!mapping->nrpages) return; folio_batch_init(&fbatch); do { unsigned int nr, i; nr = filemap_get_folios(mapping, &index, end - 1, &fbatch); if (!nr) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; folio_lock(folio); if (folio->mapping != mapping) { folio_unlock(folio); continue; } if (ino != get_page_private_data(&folio->page)) { folio_unlock(folio); continue; } generic_error_remove_folio(mapping, folio); folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); } while (index < end); } int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { struct inode *inode; if (!test_opt(sbi, COMPRESS_CACHE)) return 0; inode = f2fs_iget(sbi->sb, F2FS_COMPRESS_INO(sbi)); if (IS_ERR(inode)) return PTR_ERR(inode); sbi->compress_inode = inode; sbi->compress_percent = COMPRESS_PERCENT; sbi->compress_watermark = COMPRESS_WATERMARK; atomic_set(&sbi->compress_page_hit, 0); return 0; } void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { if (!sbi->compress_inode) return; iput(sbi->compress_inode); sbi->compress_inode = NULL; } int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; char 
slab_name[35]; if (!f2fs_sb_has_compression(sbi)) return 0; sprintf(slab_name, "f2fs_page_array_entry-%u:%u", MAJOR(dev), MINOR(dev)); sbi->page_array_slab_size = sizeof(struct page *) << F2FS_OPTION(sbi).compress_log_size; sbi->page_array_slab = f2fs_kmem_cache_create(slab_name, sbi->page_array_slab_size); return sbi->page_array_slab ? 0 : -ENOMEM; } void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { kmem_cache_destroy(sbi->page_array_slab); } int __init f2fs_init_compress_cache(void) { cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry", sizeof(struct compress_io_ctx)); if (!cic_entry_slab) return -ENOMEM; dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry", sizeof(struct decompress_io_ctx)); if (!dic_entry_slab) goto free_cic; return 0; free_cic: kmem_cache_destroy(cic_entry_slab); return -ENOMEM; } void f2fs_destroy_compress_cache(void) { kmem_cache_destroy(dic_entry_slab); kmem_cache_destroy(cic_entry_slab); }
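The compression and decompression paths above only run for inodes carrying the compression flag on an f2fs filesystem mounted with compression enabled (compress_algorithm= and related mount options). Below is a minimal userspace sketch, under that assumption, of turning the flag on for a newly created (still empty) file via FS_IOC_GETFLAGS/FS_IOC_SETFLAGS, i.e. the same FS_COMPR_FL bit that "chattr +c" sets; the path is a placeholder.

/*
 * Userspace sketch: enable per-file compression on f2fs by setting
 * FS_COMPR_FL. Assumes the file lives on an f2fs mount with compression
 * enabled and has no blocks allocated yet; "/mnt/f2fs/newfile" is a
 * placeholder path.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(void)
{
	int fd = open("/mnt/f2fs/newfile", O_RDONLY);
	int attr;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
		attr |= FS_COMPR_FL;	/* same bit as chattr +c */
		if (ioctl(fd, FS_IOC_SETFLAGS, &attr) != 0)
			perror("FS_IOC_SETFLAGS");
	} else {
		perror("FS_IOC_GETFLAGS");
	}
	close(fd);
	return 0;
}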
// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "../kernel/futex/futex.h" #include "io_uring.h" #include "rsrc.h" #include "futex.h" struct io_futex { struct file *file; union { u32 __user *uaddr; struct futex_waitv __user *uwaitv; }; unsigned long futex_val; unsigned long futex_mask; unsigned long futexv_owned; u32 futex_flags; unsigned int futex_nr; bool futexv_unqueued; }; struct io_futex_data { union { struct futex_q q; struct io_cache_entry cache; }; struct io_kiocb *req; }; void io_futex_cache_init(struct io_ring_ctx *ctx) { io_alloc_cache_init(&ctx->futex_cache, IO_NODE_ALLOC_CACHE_MAX, sizeof(struct io_futex_data)); } static void io_futex_cache_entry_free(struct io_cache_entry *entry) { kfree(container_of(entry, struct io_futex_data, cache)); } void io_futex_cache_free(struct io_ring_ctx *ctx) { io_alloc_cache_free(&ctx->futex_cache, io_futex_cache_entry_free); } static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) { req->async_data = NULL; hlist_del_init(&req->hash_node); io_req_task_complete(req, ts); } static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) { struct io_futex_data *ifd = req->async_data; struct io_ring_ctx *ctx = req->ctx; io_tw_lock(ctx, ts); if (!io_alloc_cache_put(&ctx->futex_cache, &ifd->cache)) kfree(ifd); __io_futex_complete(req, ts); } static void io_futexv_complete(struct io_kiocb *req, struct io_tw_state *ts) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct futex_vector *futexv = req->async_data; io_tw_lock(req->ctx, ts); if (!iof->futexv_unqueued) { int res; res = futex_unqueue_multiple(futexv, iof->futex_nr); if (res != -1) io_req_set_res(req, res, 0); } kfree(req->async_data); req->flags &= ~REQ_F_ASYNC_DATA; __io_futex_complete(req, ts); } static bool io_futexv_claim(struct io_futex *iof) { if (test_bit(0, &iof->futexv_owned) || test_and_set_bit_lock(0, &iof->futexv_owned)) return false;
return true; } static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) { /* futex wake already done or in progress */ if (req->opcode == IORING_OP_FUTEX_WAIT) { struct io_futex_data *ifd = req->async_data; if (!futex_unqueue(&ifd->q)) return false; req->io_task_work.func = io_futex_complete; } else { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); if (!io_futexv_claim(iof)) return false; req->io_task_work.func = io_futexv_complete; } hlist_del_init(&req->hash_node); io_req_set_res(req, -ECANCELED, 0); io_req_task_work_add(req); return true; } int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags) { struct hlist_node *tmp; struct io_kiocb *req; int nr = 0; if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) return -ENOENT; io_ring_submit_lock(ctx, issue_flags); hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { if (req->cqe.user_data != cd->data && !(cd->flags & IORING_ASYNC_CANCEL_ANY)) continue; if (__io_futex_cancel(ctx, req)) nr++; if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) break; } io_ring_submit_unlock(ctx, issue_flags); if (nr) return nr; return -ENOENT; } bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) { struct hlist_node *tmp; struct io_kiocb *req; bool found = false; lockdep_assert_held(&ctx->uring_lock); hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { if (!io_match_task_safe(req, task, cancel_all)) continue; __io_futex_cancel(ctx, req); found = true; } return found; } int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); u32 flags; if (unlikely(sqe->len || sqe->futex_flags || sqe->buf_index || sqe->file_index)) return -EINVAL; iof->uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr)); iof->futex_val = READ_ONCE(sqe->addr2); iof->futex_mask = READ_ONCE(sqe->addr3); flags = READ_ONCE(sqe->fd); if (flags & ~FUTEX2_VALID_MASK) return -EINVAL; iof->futex_flags = futex2_to_flags(flags); if (!futex_flags_valid(iof->futex_flags)) return -EINVAL; if (!futex_validate_input(iof->futex_flags, iof->futex_val) || !futex_validate_input(iof->futex_flags, iof->futex_mask)) return -EINVAL; return 0; } static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q) { struct io_kiocb *req = q->wake_data; struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); if (!io_futexv_claim(iof)) return; if (unlikely(!__futex_wake_mark(q))) return; io_req_set_res(req, 0, 0); req->io_task_work.func = io_futexv_complete; io_req_task_work_add(req); } int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct futex_vector *futexv; int ret; /* No flags or mask supported for waitv */ if (unlikely(sqe->fd || sqe->buf_index || sqe->file_index || sqe->addr2 || sqe->futex_flags || sqe->addr3)) return -EINVAL; iof->uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr)); iof->futex_nr = READ_ONCE(sqe->len); if (!iof->futex_nr || iof->futex_nr > FUTEX_WAITV_MAX) return -EINVAL; futexv = kcalloc(iof->futex_nr, sizeof(*futexv), GFP_KERNEL); if (!futexv) return -ENOMEM; ret = futex_parse_waitv(futexv, iof->uwaitv, iof->futex_nr, io_futex_wakev_fn, req); if (ret) { kfree(futexv); return ret; } iof->futexv_owned = 0; iof->futexv_unqueued = 0; req->flags |= REQ_F_ASYNC_DATA; req->async_data = futexv; return 0; } static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q) 
{ struct io_futex_data *ifd = container_of(q, struct io_futex_data, q); struct io_kiocb *req = ifd->req; if (unlikely(!__futex_wake_mark(q))) return; io_req_set_res(req, 0, 0); req->io_task_work.func = io_futex_complete; io_req_task_work_add(req); } static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx) { struct io_cache_entry *entry; entry = io_alloc_cache_get(&ctx->futex_cache); if (entry) return container_of(entry, struct io_futex_data, cache); return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT); } int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct futex_vector *futexv = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret, woken = -1; io_ring_submit_lock(ctx, issue_flags); ret = futex_wait_multiple_setup(futexv, iof->futex_nr, &woken); /* * Error case, ret is < 0. Mark the request as failed. */ if (unlikely(ret < 0)) { io_ring_submit_unlock(ctx, issue_flags); req_set_fail(req); io_req_set_res(req, ret, 0); kfree(futexv); req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; return IOU_OK; } /* * 0 return means that we successfully setup the waiters, and that * nobody triggered a wakeup while we were doing so. If the wakeup * happened post setup, the task_work will be run post this issue and * under the submission lock. 1 means We got woken while setting up, * let that side do the completion. Note that * futex_wait_multiple_setup() will have unqueued all the futexes in * this case. Mark us as having done that already, since this is * different from normal wakeup. */ if (!ret) { /* * If futex_wait_multiple_setup() returns 0 for a * successful setup, then the task state will not be * runnable. This is fine for the sync syscall, as * it'll be blocking unless we already got one of the * futexes woken, but it obviously won't work for an * async invocation. Mark us runnable again. */ __set_current_state(TASK_RUNNING); hlist_add_head(&req->hash_node, &ctx->futex_list); } else { iof->futexv_unqueued = 1; if (woken != -1) io_req_set_res(req, woken, 0); } io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct io_ring_ctx *ctx = req->ctx; struct io_futex_data *ifd = NULL; struct futex_hash_bucket *hb; int ret; if (!iof->futex_mask) { ret = -EINVAL; goto done; } io_ring_submit_lock(ctx, issue_flags); ifd = io_alloc_ifd(ctx); if (!ifd) { ret = -ENOMEM; goto done_unlock; } req->async_data = ifd; ifd->q = futex_q_init; ifd->q.bitset = iof->futex_mask; ifd->q.wake = io_futex_wake_fn; ifd->req = req; ret = futex_wait_setup(iof->uaddr, iof->futex_val, iof->futex_flags, &ifd->q, &hb); if (!ret) { hlist_add_head(&req->hash_node, &ctx->futex_list); io_ring_submit_unlock(ctx, issue_flags); futex_queue(&ifd->q, hb); return IOU_ISSUE_SKIP_COMPLETE; } done_unlock: io_ring_submit_unlock(ctx, issue_flags); done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); kfree(ifd); return IOU_OK; } int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); int ret; /* * Strict flags - ensure that waking 0 futexes yields a 0 result. * See commit 43adf8449510 ("futex: FLAGS_STRICT") for details. */ ret = futex_wake(iof->uaddr, FLAGS_STRICT | iof->futex_flags, iof->futex_val, iof->futex_mask); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; }
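/*
 * Illustrative userspace sketch, not part of the kernel sources above: one
 * way to drive the IORING_OP_FUTEX_WAIT and IORING_OP_FUTEX_WAKE handlers
 * implemented in this file. It assumes liburing >= 2.5 (which provides the
 * io_uring_prep_futex_wait()/io_uring_prep_futex_wake() helpers) and a uapi
 * exposing FUTEX2_SIZE_U32; the futex word, mask and user_data values are
 * arbitrary examples.
 */
#include <liburing.h>
#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t futex_word = 0;	/* stays 0, so the wait gets queued */
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int i;

	if (io_uring_queue_init(8, &ring, 0))
		return 1;

	/* async wait: completes once a wake is delivered on futex_word */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_futex_wait(sqe, &futex_word, 0, FUTEX_BITSET_MATCH_ANY,
				 FUTEX2_SIZE_U32, 0);
	sqe->user_data = 1;

	/*
	 * Wake one waiter on the same word. A real producer would store a
	 * new value to futex_word first; submitting both SQEs in one batch
	 * works here because they are issued in submission order.
	 */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_futex_wake(sqe, &futex_word, 1, FUTEX_BITSET_MATCH_ANY,
				 FUTEX2_SIZE_U32, 0);
	sqe->user_data = 2;

	io_uring_submit(&ring);

	for (i = 0; i < 2; i++) {
		if (io_uring_wait_cqe(&ring, &cqe))
			break;
		/* the wait completes with res 0, the wake with the count woken */
		printf("user_data=%llu res=%d\n",
		       (unsigned long long)cqe->user_data, cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}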
// SPDX-License-Identifier: GPL-2.0 /* Shared Memory Communications Direct over ISM devices (SMC-D) * * Functions for ISM device. * * Copyright IBM Corp.
2018 */ #include <linux/if_vlan.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/slab.h> #include <asm/page.h> #include "smc.h" #include "smc_core.h" #include "smc_ism.h" #include "smc_pnet.h" #include "smc_netlink.h" #include "linux/ism.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), .mutex = __MUTEX_INITIALIZER(smcd_dev_list.mutex) }; static bool smc_ism_v2_capable; static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; #if IS_ENABLED(CONFIG_ISM) static void smcd_register_dev(struct ism_dev *ism); static void smcd_unregister_dev(struct ism_dev *ism); static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, u16 dmbemask); static struct ism_client smc_ism_client = { .name = "SMC-D", .add = smcd_register_dev, .remove = smcd_unregister_dev, .handle_event = smcd_handle_event, .handle_irq = smcd_handle_irq, }; #endif static void smc_ism_create_system_eid(void) { struct smc_ism_seid *seid = (struct smc_ism_seid *)smc_ism_v2_system_eid; #if IS_ENABLED(CONFIG_S390) struct cpuid id; u16 ident_tail; char tmp[5]; memcpy(seid->seid_string, "IBM-SYSZ-ISMSEID00000000", 24); get_cpu_id(&id); ident_tail = (u16)(id.ident & SMC_ISM_IDENT_MASK); snprintf(tmp, 5, "%04X", ident_tail); memcpy(seid->serial_number, tmp, 4); snprintf(tmp, 5, "%04X", id.machine); memcpy(seid->type, tmp, 4); #else memset(seid, 0, SMC_MAX_EID_LEN); #endif } /* Test if an ISM communication is possible - same CPC */ int smc_ism_cantalk(struct smcd_gid *peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) { return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0, vlan_id); } void smc_ism_get_system_eid(u8 **eid) { if (!smc_ism_v2_capable) *eid = NULL; else *eid = smc_ism_v2_system_eid; } u16 smc_ism_get_chid(struct smcd_dev *smcd) { return smcd->ops->get_chid(smcd); } /* HW supports ISM V2 and thus System EID is defined */ bool smc_ism_is_v2_capable(void) { return smc_ism_v2_capable; } /* Set a connection using this DMBE. */ void smc_ism_set_conn(struct smc_connection *conn) { unsigned long flags; spin_lock_irqsave(&conn->lgr->smcd->lock, flags); conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = conn; spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); } /* Unset a connection using this DMBE. */ void smc_ism_unset_conn(struct smc_connection *conn) { unsigned long flags; if (!conn->rmb_desc) return; spin_lock_irqsave(&conn->lgr->smcd->lock, flags); conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = NULL; spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); } /* Register a VLAN identifier with the ISM device. Use a reference count * and add a VLAN identifier only when the first DMB using this VLAN is * registered. */ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) { struct smc_ism_vlanid *new_vlan, *vlan; unsigned long flags; int rc = 0; if (!vlanid) /* No valid vlan id */ return -EINVAL; /* create new vlan entry, in case we need it */ new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL); if (!new_vlan) return -ENOMEM; new_vlan->vlanid = vlanid; refcount_set(&new_vlan->refcnt, 1); /* if there is an existing entry, increase count and return */ spin_lock_irqsave(&smcd->lock, flags); list_for_each_entry(vlan, &smcd->vlan, list) { if (vlan->vlanid == vlanid) { refcount_inc(&vlan->refcnt); kfree(new_vlan); goto out; } } /* no existing entry found. 
* add new entry to device; might fail, e.g., if HW limit reached */ if (smcd->ops->add_vlan_id(smcd, vlanid)) { kfree(new_vlan); rc = -EIO; goto out; } list_add_tail(&new_vlan->list, &smcd->vlan); out: spin_unlock_irqrestore(&smcd->lock, flags); return rc; } /* Unregister a VLAN identifier with the ISM device. Use a reference count * and remove a VLAN identifier only when the last DMB using this VLAN is * unregistered. */ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) { struct smc_ism_vlanid *vlan; unsigned long flags; bool found = false; int rc = 0; if (!vlanid) /* No valid vlan id */ return -EINVAL; spin_lock_irqsave(&smcd->lock, flags); list_for_each_entry(vlan, &smcd->vlan, list) { if (vlan->vlanid == vlanid) { if (!refcount_dec_and_test(&vlan->refcnt)) goto out; found = true; break; } } if (!found) { rc = -ENOENT; goto out; /* VLAN id not in table */ } /* Found and the last reference just gone */ if (smcd->ops->del_vlan_id(smcd, vlanid)) rc = -EIO; list_del(&vlan->list); kfree(vlan); out: spin_unlock_irqrestore(&smcd->lock, flags); return rc; } int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) { struct smcd_dmb dmb; int rc = 0; if (!dmb_desc->dma_addr) return rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; dmb.sba_idx = dmb_desc->sba_idx; dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; rc = smcd->ops->unregister_dmb(smcd, &dmb); if (!rc || rc == ISM_ERROR) { dmb_desc->cpu_addr = NULL; dmb_desc->dma_addr = 0; } return rc; } int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, struct smc_buf_desc *dmb_desc) { #if IS_ENABLED(CONFIG_ISM) struct smcd_dmb dmb; int rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_len = dmb_len; dmb.sba_idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; dmb.rgid = lgr->peer_gid.gid; rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, &smc_ism_client); if (!rc) { dmb_desc->sba_idx = dmb.sba_idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; dmb_desc->len = dmb.dmb_len; } return rc; #else return 0; #endif } static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, struct sk_buff *skb, struct netlink_callback *cb) { char smc_pnet[SMC_MAX_PNETID_LEN + 1]; struct smc_pci_dev smc_pci_dev; struct nlattr *port_attrs; struct nlattr *attrs; struct ism_dev *ism; int use_cnt = 0; void *nlh; ism = smcd->priv; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_DEV_SMCD); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCD); if (!attrs) goto errout; use_cnt = atomic_read(&smcd->lgr_cnt); if (nla_put_u32(skb, SMC_NLA_DEV_USE_CNT, use_cnt)) goto errattr; if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); smc_set_pci_values(to_pci_dev(ism->dev.parent), &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev.pci_vendor)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev.pci_device)) goto errattr; if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev.pci_id)) goto errattr; port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT); if (!port_attrs) goto errattr; if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, smcd->pnetid_by_user)) goto errportattr; memcpy(smc_pnet, 
smcd->pnetid, SMC_MAX_PNETID_LEN); smc_pnet[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) goto errportattr; nla_nest_end(skb, port_attrs); nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); return 0; errportattr: nla_nest_cancel(skb, port_attrs); errattr: nla_nest_cancel(skb, attrs); errout: nlmsg_cancel(skb, nlh); errmsg: return -EMSGSIZE; } static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); int snum = cb_ctx->pos[0]; struct smcd_dev *smcd; int num = 0; mutex_lock(&dev_list->mutex); list_for_each_entry(smcd, &dev_list->list, list) { if (num < snum) goto next; if (smc_nl_handle_smcd_dev(smcd, skb, cb)) goto errout; next: num++; } errout: mutex_unlock(&dev_list->mutex); cb_ctx->pos[0] = num; } int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) { smc_nl_prep_smcd_dev(&smcd_dev_list, skb, cb); return skb->len; } #if IS_ENABLED(CONFIG_ISM) struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; struct ism_event event; }; #define ISM_EVENT_REQUEST 0x0001 #define ISM_EVENT_RESPONSE 0x0002 #define ISM_EVENT_REQUEST_IR 0x00000001 #define ISM_EVENT_CODE_SHUTDOWN 0x80 #define ISM_EVENT_CODE_TESTLINK 0x83 union smcd_sw_event_info { u64 info; struct { u8 uid[SMC_LGR_ID_SIZE]; unsigned short vlan_id; u16 code; }; }; static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) { struct smcd_gid peer_gid = { .gid = wrk->event.tok, .gid_ext = 0 }; union smcd_sw_event_info ev_info; ev_info.info = wrk->event.info; switch (wrk->event.code) { case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */ smc_smcd_terminate(wrk->smcd, &peer_gid, ev_info.vlan_id); break; case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ if (ev_info.code == ISM_EVENT_REQUEST) { ev_info.code = ISM_EVENT_RESPONSE; wrk->smcd->ops->signal_event(wrk->smcd, &peer_gid, ISM_EVENT_REQUEST_IR, ISM_EVENT_CODE_TESTLINK, ev_info.info); } break; } } /* worker for SMC-D events */ static void smc_ism_event_work(struct work_struct *work) { struct smc_ism_event_work *wrk = container_of(work, struct smc_ism_event_work, work); struct smcd_gid smcd_gid = { .gid = wrk->event.tok, .gid_ext = 0 }; switch (wrk->event.type) { case ISM_EVENT_GID: /* GID event, token is peer GID */ smc_smcd_terminate(wrk->smcd, &smcd_gid, VLAN_VID_MASK); break; case ISM_EVENT_DMB: break; case ISM_EVENT_SWR: /* Software defined event */ smcd_handle_sw_event(wrk); break; } kfree(wrk); } static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, const struct smcd_ops *ops, int max_dmbs) { struct smcd_dev *smcd; smcd = devm_kzalloc(parent, sizeof(*smcd), GFP_KERNEL); if (!smcd) return NULL; smcd->conn = devm_kcalloc(parent, max_dmbs, sizeof(struct smc_connection *), GFP_KERNEL); if (!smcd->conn) return NULL; smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); if (!smcd->event_wq) return NULL; smcd->ops = ops; spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); INIT_LIST_HEAD(&smcd->vlan); INIT_LIST_HEAD(&smcd->lgr_list); init_waitqueue_head(&smcd->lgrs_deleted); return smcd; } static void smcd_register_dev(struct ism_dev *ism) { const struct smcd_ops *ops = ism_get_smcd_ops(); struct smcd_dev *smcd; if (!ops) return; smcd = smcd_alloc_dev(&ism->pdev->dev, dev_name(&ism->pdev->dev), ops, ISM_NR_DMBS); if (!smcd) return; smcd->priv = ism; ism_set_priv(ism, &smc_ism_client, smcd); if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, 
smcd->pnetid)) smc_pnetid_by_table_smcd(smcd); mutex_lock(&smcd_dev_list.mutex); if (list_empty(&smcd_dev_list.list)) { if (smcd->ops->supports_v2()) smc_ism_v2_capable = true; } /* sort list: devices without pnetid before devices with pnetid */ if (smcd->pnetid[0]) list_add_tail(&smcd->list, &smcd_dev_list.list); else list_add(&smcd->list, &smcd_dev_list.list); mutex_unlock(&smcd_dev_list.mutex); pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", dev_name(&ism->dev), smcd->pnetid, smcd->pnetid_by_user ? " (user defined)" : ""); return; } static void smcd_unregister_dev(struct ism_dev *ism) { struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); pr_warn_ratelimited("smc: removing smcd device %s\n", dev_name(&ism->dev)); smcd->going_away = 1; smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); destroy_workqueue(smcd->event_wq); } /* SMCD Device event handler. Called from ISM device interrupt handler. * Parameters are ism device pointer, * - event->type (0 --> DMB, 1 --> GID), * - event->code (event code), * - event->tok (either DMB token when event type 0, or GID when event type 1) * - event->time (time of day) * - event->info (debug info). * * Context: * - Function called in IRQ context from ISM device driver event handler. */ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) { struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); struct smc_ism_event_work *wrk; if (smcd->going_away) return; /* copy event to event work queue, and let it be handled there */ wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); if (!wrk) return; INIT_WORK(&wrk->work, smc_ism_event_work); wrk->smcd = smcd; wrk->event = *event; queue_work(smcd->event_wq, &wrk->work); } /* SMCD Device interrupt handler. Called from ISM device interrupt handler. * Parameters are the ism device pointer, DMB number, and the DMBE bitmask. * Find the connection and schedule the tasklet for this connection. * * Context: * - Function called in IRQ context from ISM device driver IRQ handler. */ static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, u16 dmbemask) { struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); struct smc_connection *conn = NULL; unsigned long flags; spin_lock_irqsave(&smcd->lock, flags); conn = smcd->conn[dmbno]; if (conn && !conn->killed) tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } #endif int smc_ism_signal_shutdown(struct smc_link_group *lgr) { int rc = 0; #if IS_ENABLED(CONFIG_ISM) union smcd_sw_event_info ev_info; if (lgr->peer_shutdown) return 0; memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; ev_info.code = ISM_EVENT_REQUEST; rc = lgr->smcd->ops->signal_event(lgr->smcd, &lgr->peer_gid, ISM_EVENT_REQUEST_IR, ISM_EVENT_CODE_SHUTDOWN, ev_info.info); #endif return rc; } int smc_ism_init(void) { int rc = 0; smc_ism_v2_capable = false; smc_ism_create_system_eid(); #if IS_ENABLED(CONFIG_ISM) rc = ism_register_client(&smc_ism_client); #endif return rc; } void smc_ism_exit(void) { #if IS_ENABLED(CONFIG_ISM) ism_unregister_client(&smc_ism_client); #endif }
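/*
 * Illustrative userspace sketch of the refcounted-VLAN registration pattern
 * used by smc_ism_get_vlan()/smc_ism_put_vlan() above: allocate the candidate
 * entry before taking the lock, then either bump the refcount of an existing
 * entry (discarding the candidate) or insert the new one. A pthread mutex and
 * a plain counter stand in for the kernel's spinlock and refcount_t; all
 * names here are made up for the example.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct vlan_entry {
	struct vlan_entry *next;
	unsigned short vlanid;
	unsigned int refcnt;
};

static struct vlan_entry *vlan_list;
static pthread_mutex_t vlan_lock = PTHREAD_MUTEX_INITIALIZER;

static int vlan_get(unsigned short vlanid)
{
	struct vlan_entry *new_entry, *e;

	if (!vlanid)			/* no valid vlan id */
		return -1;
	/* create the new entry outside the lock, in case we need it */
	new_entry = calloc(1, sizeof(*new_entry));
	if (!new_entry)
		return -1;
	new_entry->vlanid = vlanid;
	new_entry->refcnt = 1;

	pthread_mutex_lock(&vlan_lock);
	for (e = vlan_list; e; e = e->next) {
		if (e->vlanid == vlanid) {
			e->refcnt++;	/* already registered */
			pthread_mutex_unlock(&vlan_lock);
			free(new_entry);
			return 0;
		}
	}
	new_entry->next = vlan_list;	/* first user of this id */
	vlan_list = new_entry;
	pthread_mutex_unlock(&vlan_lock);
	return 0;
}

static int vlan_put(unsigned short vlanid)
{
	struct vlan_entry **pp, *e;
	int rc = -1;

	pthread_mutex_lock(&vlan_lock);
	for (pp = &vlan_list; (e = *pp) != NULL; pp = &e->next) {
		if (e->vlanid == vlanid) {
			rc = 0;
			if (--e->refcnt == 0) {	/* last user: unlink and free */
				*pp = e->next;
				free(e);
			}
			break;
		}
	}
	pthread_mutex_unlock(&vlan_lock);
	return rc;
}

int main(void)
{
	vlan_get(100);
	vlan_get(100);				/* second user bumps refcount */
	printf("%d\n", vlan_put(100));		/* 0, entry kept */
	printf("%d\n", vlan_put(100));		/* 0, last user: entry freed */
	printf("%d\n", vlan_put(100));		/* -1, no longer registered */
	return 0;
}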
// SPDX-License-Identifier: GPL-2.0 /* * fs/bfs/dir.c * BFS directory operations. * Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com> * Made endianness-clean by Andrew Stribblehill <ads@wompom.org> 2005 */ #include <linux/time.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/sched.h> #include "bfs.h" #undef DEBUG #ifdef DEBUG #define dprintf(x...) printf(x) #else #define dprintf(x...)
#endif static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino); static struct buffer_head *bfs_find_entry(struct inode *dir, const struct qstr *child, struct bfs_dirent **res_dir); static int bfs_readdir(struct file *f, struct dir_context *ctx) { struct inode *dir = file_inode(f); struct buffer_head *bh; struct bfs_dirent *de; unsigned int offset; int block; if (ctx->pos & (BFS_DIRENT_SIZE - 1)) { printf("Bad f_pos=%08lx for %s:%08lx\n", (unsigned long)ctx->pos, dir->i_sb->s_id, dir->i_ino); return -EINVAL; } while (ctx->pos < dir->i_size) { offset = ctx->pos & (BFS_BSIZE - 1); block = BFS_I(dir)->i_sblock + (ctx->pos >> BFS_BSIZE_BITS); bh = sb_bread(dir->i_sb, block); if (!bh) { ctx->pos += BFS_BSIZE - offset; continue; } do { de = (struct bfs_dirent *)(bh->b_data + offset); if (de->ino) { int size = strnlen(de->name, BFS_NAMELEN); if (!dir_emit(ctx, de->name, size, le16_to_cpu(de->ino), DT_UNKNOWN)) { brelse(bh); return 0; } } offset += BFS_DIRENT_SIZE; ctx->pos += BFS_DIRENT_SIZE; } while ((offset < BFS_BSIZE) && (ctx->pos < dir->i_size)); brelse(bh); } return 0; } const struct file_operations bfs_dir_operations = { .read = generic_read_dir, .iterate_shared = bfs_readdir, .fsync = generic_file_fsync, .llseek = generic_file_llseek, }; static int bfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { int err; struct inode *inode; struct super_block *s = dir->i_sb; struct bfs_sb_info *info = BFS_SB(s); unsigned long ino; inode = new_inode(s); if (!inode) return -ENOMEM; mutex_lock(&info->bfs_lock); ino = find_first_zero_bit(info->si_imap, info->si_lasti + 1); if (ino > info->si_lasti) { mutex_unlock(&info->bfs_lock); iput(inode); return -ENOSPC; } set_bit(ino, info->si_imap); info->si_freei--; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); simple_inode_init_ts(inode); inode->i_blocks = 0; inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; inode->i_mapping->a_ops = &bfs_aops; inode->i_ino = ino; BFS_I(inode)->i_dsk_ino = ino; BFS_I(inode)->i_sblock = 0; BFS_I(inode)->i_eblock = 0; insert_inode_hash(inode); mark_inode_dirty(inode); bfs_dump_imap("create", s); err = bfs_add_entry(dir, &dentry->d_name, inode->i_ino); if (err) { inode_dec_link_count(inode); mutex_unlock(&info->bfs_lock); iput(inode); return err; } mutex_unlock(&info->bfs_lock); d_instantiate(dentry, inode); return 0; } static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = NULL; struct buffer_head *bh; struct bfs_dirent *de; struct bfs_sb_info *info = BFS_SB(dir->i_sb); if (dentry->d_name.len > BFS_NAMELEN) return ERR_PTR(-ENAMETOOLONG); mutex_lock(&info->bfs_lock); bh = bfs_find_entry(dir, &dentry->d_name, &de); if (bh) { unsigned long ino = (unsigned long)le16_to_cpu(de->ino); brelse(bh); inode = bfs_iget(dir->i_sb, ino); } mutex_unlock(&info->bfs_lock); return d_splice_alias(inode, dentry); } static int bfs_link(struct dentry *old, struct inode *dir, struct dentry *new) { struct inode *inode = d_inode(old); struct bfs_sb_info *info = BFS_SB(inode->i_sb); int err; mutex_lock(&info->bfs_lock); err = bfs_add_entry(dir, &new->d_name, inode->i_ino); if (err) { mutex_unlock(&info->bfs_lock); return err; } inc_nlink(inode); inode_set_ctime_current(inode); mark_inode_dirty(inode); ihold(inode); d_instantiate(new, inode); mutex_unlock(&info->bfs_lock); return 0; } static int bfs_unlink(struct inode *dir, struct dentry *dentry) { int error = -ENOENT; struct inode *inode = 
d_inode(dentry); struct buffer_head *bh; struct bfs_dirent *de; struct bfs_sb_info *info = BFS_SB(inode->i_sb); mutex_lock(&info->bfs_lock); bh = bfs_find_entry(dir, &dentry->d_name, &de); if (!bh || (le16_to_cpu(de->ino) != inode->i_ino)) goto out_brelse; if (!inode->i_nlink) { printf("unlinking non-existent file %s:%lu (nlink=%d)\n", inode->i_sb->s_id, inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } de->ino = 0; mark_buffer_dirty_inode(bh, dir); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); error = 0; out_brelse: brelse(bh); mutex_unlock(&info->bfs_lock); return error; } static int bfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode, *new_inode; struct buffer_head *old_bh, *new_bh; struct bfs_dirent *old_de, *new_de; struct bfs_sb_info *info; int error = -ENOENT; if (flags & ~RENAME_NOREPLACE) return -EINVAL; old_bh = new_bh = NULL; old_inode = d_inode(old_dentry); if (S_ISDIR(old_inode->i_mode)) return -EINVAL; info = BFS_SB(old_inode->i_sb); mutex_lock(&info->bfs_lock); old_bh = bfs_find_entry(old_dir, &old_dentry->d_name, &old_de); if (!old_bh || (le16_to_cpu(old_de->ino) != old_inode->i_ino)) goto end_rename; error = -EPERM; new_inode = d_inode(new_dentry); new_bh = bfs_find_entry(new_dir, &new_dentry->d_name, &new_de); if (new_bh && !new_inode) { brelse(new_bh); new_bh = NULL; } if (!new_bh) { error = bfs_add_entry(new_dir, &new_dentry->d_name, old_inode->i_ino); if (error) goto end_rename; } old_de->ino = 0; inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); mark_inode_dirty(old_dir); if (new_inode) { inode_set_ctime_current(new_inode); inode_dec_link_count(new_inode); } mark_buffer_dirty_inode(old_bh, old_dir); error = 0; end_rename: mutex_unlock(&info->bfs_lock); brelse(old_bh); brelse(new_bh); return error; } const struct inode_operations bfs_dir_inops = { .create = bfs_create, .lookup = bfs_lookup, .link = bfs_link, .unlink = bfs_unlink, .rename = bfs_rename, }; static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino) { const unsigned char *name = child->name; int namelen = child->len; struct buffer_head *bh; struct bfs_dirent *de; int block, sblock, eblock, off, pos; int i; dprintf("name=%s, namelen=%d\n", name, namelen); sblock = BFS_I(dir)->i_sblock; eblock = BFS_I(dir)->i_eblock; for (block = sblock; block <= eblock; block++) { bh = sb_bread(dir->i_sb, block); if (!bh) return -EIO; for (off = 0; off < BFS_BSIZE; off += BFS_DIRENT_SIZE) { de = (struct bfs_dirent *)(bh->b_data + off); if (!de->ino) { pos = (block - sblock) * BFS_BSIZE + off; if (pos >= dir->i_size) { dir->i_size += BFS_DIRENT_SIZE; inode_set_ctime_current(dir); } inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); de->ino = cpu_to_le16((u16)ino); for (i = 0; i < BFS_NAMELEN; i++) de->name[i] = (i < namelen) ? 
name[i] : 0; mark_buffer_dirty_inode(bh, dir); brelse(bh); return 0; } } brelse(bh); } return -ENOSPC; } static inline int bfs_namecmp(int len, const unsigned char *name, const char *buffer) { if ((len < BFS_NAMELEN) && buffer[len]) return 0; return !memcmp(name, buffer, len); } static struct buffer_head *bfs_find_entry(struct inode *dir, const struct qstr *child, struct bfs_dirent **res_dir) { unsigned long block = 0, offset = 0; struct buffer_head *bh = NULL; struct bfs_dirent *de; const unsigned char *name = child->name; int namelen = child->len; *res_dir = NULL; if (namelen > BFS_NAMELEN) return NULL; while (block * BFS_BSIZE + offset < dir->i_size) { if (!bh) { bh = sb_bread(dir->i_sb, BFS_I(dir)->i_sblock + block); if (!bh) { block++; continue; } } de = (struct bfs_dirent *)(bh->b_data + offset); offset += BFS_DIRENT_SIZE; if (le16_to_cpu(de->ino) && bfs_namecmp(namelen, name, de->name)) { *res_dir = de; return bh; } if (offset < bh->b_size) continue; brelse(bh); bh = NULL; offset = 0; block++; } brelse(bh); return NULL; }
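/*
 * Illustrative userspace sketch of the fixed-width directory entry scan done
 * by bfs_find_entry()/bfs_namecmp() above. The layout mirrors what that code
 * assumes about the BFS on-disk format (16-byte slots holding a 16-bit inode
 * number plus a NUL-padded name, 512-byte blocks); the EX_* names and the
 * host-endian shortcut for the inode field are simplifications for the
 * example only.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define EX_BSIZE	512	/* one directory block */
#define EX_NAMELEN	14
#define EX_DIRENT_SIZE	16

struct ex_dirent {
	uint16_t ino;			/* 0 marks a free slot */
	char	 name[EX_NAMELEN];	/* NUL-padded, not NUL-terminated */
};

/* same rule as bfs_namecmp(): a shorter name only matches if the slot is
 * NUL-terminated right after it, otherwise a prefix would pass the test */
static int ex_namecmp(size_t len, const char *name, const char *slot)
{
	if (len < EX_NAMELEN && slot[len])
		return 0;
	return !memcmp(name, slot, len);
}

/* scan one directory block for "name"; return its inode number or 0 */
static uint16_t ex_find_in_block(const unsigned char *block, const char *name)
{
	size_t len = strlen(name);
	unsigned int off;

	if (!len || len > EX_NAMELEN)	/* mirrors the BFS_NAMELEN check */
		return 0;
	for (off = 0; off + EX_DIRENT_SIZE <= EX_BSIZE; off += EX_DIRENT_SIZE) {
		struct ex_dirent de;

		memcpy(&de, block + off, sizeof(de));
		if (de.ino && ex_namecmp(len, name, de.name))
			return de.ino;
	}
	return 0;
}

int main(void)
{
	unsigned char block[EX_BSIZE] = { 0 };
	struct ex_dirent de = { .ino = 7 };

	strncpy(de.name, "hello.txt", EX_NAMELEN);
	memcpy(block + EX_DIRENT_SIZE, &de, sizeof(de));	/* second slot */
	printf("ino=%u\n", ex_find_in_block(block, "hello.txt"));
	return 0;
}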
// SPDX-License-Identifier: GPL-2.0+ /* * Procedures for creating, accessing and interpreting the device tree. * * Paul Mackerras August 1996. * Copyright (C) 1996-2005 Paul Mackerras. * * Adapted for 64bit PowerPC by Dave Engebretsen and Peter Bergner. * {engebret|bergner}@us.ibm.com * * Adapted for sparc and sparc64 by David S. Miller davem@davemloft.net * * Reconsolidated from arch/x/kernel/prom.c by Stephen Rothwell and * Grant Likely. */ #define pr_fmt(fmt) "OF: " fmt #include <linux/console.h> #include <linux/ctype.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/of.h> #include <linux/of_device.h> #include <linux/of_graph.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/proc_fs.h> #include "of_private.h" LIST_HEAD(aliases_lookup); struct device_node *of_root; EXPORT_SYMBOL(of_root); struct device_node *of_chosen; EXPORT_SYMBOL(of_chosen); struct device_node *of_aliases; struct device_node *of_stdout; static const char *of_stdout_options; struct kset *of_kset; /* * Used to protect the of_aliases, to hold off addition of nodes to sysfs. * This mutex must be held whenever modifications are being made to the * device tree. The of_{attach,detach}_node() and * of_{add,remove,update}_property() helpers make sure this happens. */ DEFINE_MUTEX(of_mutex); /* use when traversing tree through the child, sibling, * or parent members of struct device_node.
*/ DEFINE_RAW_SPINLOCK(devtree_lock); bool of_node_name_eq(const struct device_node *np, const char *name) { const char *node_name; size_t len; if (!np) return false; node_name = kbasename(np->full_name); len = strchrnul(node_name, '@') - node_name; return (strlen(name) == len) && (strncmp(node_name, name, len) == 0); } EXPORT_SYMBOL(of_node_name_eq); bool of_node_name_prefix(const struct device_node *np, const char *prefix) { if (!np) return false; return strncmp(kbasename(np->full_name), prefix, strlen(prefix)) == 0; } EXPORT_SYMBOL(of_node_name_prefix); static bool __of_node_is_type(const struct device_node *np, const char *type) { const char *match = __of_get_property(np, "device_type", NULL); return np && match && type && !strcmp(match, type); } int of_bus_n_addr_cells(struct device_node *np) { u32 cells; for (; np; np = np->parent) if (!of_property_read_u32(np, "#address-cells", &cells)) return cells; /* No #address-cells property for the root node */ return OF_ROOT_NODE_ADDR_CELLS_DEFAULT; } int of_n_addr_cells(struct device_node *np) { if (np->parent) np = np->parent; return of_bus_n_addr_cells(np); } EXPORT_SYMBOL(of_n_addr_cells); int of_bus_n_size_cells(struct device_node *np) { u32 cells; for (; np; np = np->parent) if (!of_property_read_u32(np, "#size-cells", &cells)) return cells; /* No #size-cells property for the root node */ return OF_ROOT_NODE_SIZE_CELLS_DEFAULT; } int of_n_size_cells(struct device_node *np) { if (np->parent) np = np->parent; return of_bus_n_size_cells(np); } EXPORT_SYMBOL(of_n_size_cells); #ifdef CONFIG_NUMA int __weak of_node_to_nid(struct device_node *np) { return NUMA_NO_NODE; } #endif #define OF_PHANDLE_CACHE_BITS 7 #define OF_PHANDLE_CACHE_SZ BIT(OF_PHANDLE_CACHE_BITS) static struct device_node *phandle_cache[OF_PHANDLE_CACHE_SZ]; static u32 of_phandle_cache_hash(phandle handle) { return hash_32(handle, OF_PHANDLE_CACHE_BITS); } /* * Caller must hold devtree_lock. 
*/ void __of_phandle_cache_inv_entry(phandle handle) { u32 handle_hash; struct device_node *np; if (!handle) return; handle_hash = of_phandle_cache_hash(handle); np = phandle_cache[handle_hash]; if (np && handle == np->phandle) phandle_cache[handle_hash] = NULL; } void __init of_core_init(void) { struct device_node *np; of_platform_register_reconfig_notifier(); /* Create the kset, and register existing nodes */ mutex_lock(&of_mutex); of_kset = kset_create_and_add("devicetree", NULL, firmware_kobj); if (!of_kset) { mutex_unlock(&of_mutex); pr_err("failed to register existing nodes\n"); return; } for_each_of_allnodes(np) { __of_attach_node_sysfs(np); if (np->phandle && !phandle_cache[of_phandle_cache_hash(np->phandle)]) phandle_cache[of_phandle_cache_hash(np->phandle)] = np; } mutex_unlock(&of_mutex); /* Symlink in /proc as required by userspace ABI */ if (of_root) proc_symlink("device-tree", NULL, "/sys/firmware/devicetree/base"); } static struct property *__of_find_property(const struct device_node *np, const char *name, int *lenp) { struct property *pp; if (!np) return NULL; for (pp = np->properties; pp; pp = pp->next) { if (of_prop_cmp(pp->name, name) == 0) { if (lenp) *lenp = pp->length; break; } } return pp; } struct property *of_find_property(const struct device_node *np, const char *name, int *lenp) { struct property *pp; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); pp = __of_find_property(np, name, lenp); raw_spin_unlock_irqrestore(&devtree_lock, flags); return pp; } EXPORT_SYMBOL(of_find_property); struct device_node *__of_find_all_nodes(struct device_node *prev) { struct device_node *np; if (!prev) { np = of_root; } else if (prev->child) { np = prev->child; } else { /* Walk back up looking for a sibling, or the end of the structure */ np = prev; while (np->parent && !np->sibling) np = np->parent; np = np->sibling; /* Might be null at the end of the tree */ } return np; } /** * of_find_all_nodes - Get next node in global list * @prev: Previous node or NULL to start iteration * of_node_put() will be called on it * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_all_nodes(struct device_node *prev) { struct device_node *np; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); np = __of_find_all_nodes(prev); of_node_get(np); of_node_put(prev); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_all_nodes); /* * Find a property with a given name for a given node * and return the value. */ const void *__of_get_property(const struct device_node *np, const char *name, int *lenp) { struct property *pp = __of_find_property(np, name, lenp); return pp ? pp->value : NULL; } /* * Find a property with a given name for a given node * and return the value. */ const void *of_get_property(const struct device_node *np, const char *name, int *lenp) { struct property *pp = of_find_property(np, name, lenp); return pp ? pp->value : NULL; } EXPORT_SYMBOL(of_get_property); /** * __of_device_is_compatible() - Check if the node matches given constraints * @device: pointer to node * @compat: required compatible string, NULL or "" for any match * @type: required device_type value, NULL or "" for any match * @name: required node name, NULL or "" for any match * * Checks if the given @compat, @type and @name strings match the * properties of the given @device. A constraints can be skipped by * passing NULL or an empty string as the constraint. 
* * Returns 0 for no match, and a positive integer on match. The return * value is a relative score with larger values indicating better * matches. The score is weighted for the most specific compatible value * to get the highest score. Matching type is next, followed by matching * name. Practically speaking, this results in the following priority * order for matches: * * 1. specific compatible && type && name * 2. specific compatible && type * 3. specific compatible && name * 4. specific compatible * 5. general compatible && type && name * 6. general compatible && type * 7. general compatible && name * 8. general compatible * 9. type && name * 10. type * 11. name */ static int __of_device_is_compatible(const struct device_node *device, const char *compat, const char *type, const char *name) { struct property *prop; const char *cp; int index = 0, score = 0; /* Compatible match has highest priority */ if (compat && compat[0]) { prop = __of_find_property(device, "compatible", NULL); for (cp = of_prop_next_string(prop, NULL); cp; cp = of_prop_next_string(prop, cp), index++) { if (of_compat_cmp(cp, compat, strlen(compat)) == 0) { score = INT_MAX/2 - (index << 2); break; } } if (!score) return 0; } /* Matching type is better than matching name */ if (type && type[0]) { if (!__of_node_is_type(device, type)) return 0; score += 2; } /* Matching name is a bit better than not */ if (name && name[0]) { if (!of_node_name_eq(device, name)) return 0; score++; } return score; } /** Checks if the given "compat" string matches one of the strings in * the device's "compatible" property */ int of_device_is_compatible(const struct device_node *device, const char *compat) { unsigned long flags; int res; raw_spin_lock_irqsave(&devtree_lock, flags); res = __of_device_is_compatible(device, compat, NULL, NULL); raw_spin_unlock_irqrestore(&devtree_lock, flags); return res; } EXPORT_SYMBOL(of_device_is_compatible); /** Checks if the device is compatible with any of the entries in * a NULL terminated array of strings. Returns the best match * score or 0. */ int of_device_compatible_match(const struct device_node *device, const char *const *compat) { unsigned int tmp, score = 0; if (!compat) return 0; while (*compat) { tmp = of_device_is_compatible(device, *compat); if (tmp > score) score = tmp; compat++; } return score; } EXPORT_SYMBOL_GPL(of_device_compatible_match); /** * of_machine_is_compatible - Test root of device tree for a given compatible value * @compat: compatible string to look for in root node's compatible property. * * Return: A positive integer if the root node has the given value in its * compatible property. 
*/ int of_machine_is_compatible(const char *compat) { struct device_node *root; int rc = 0; root = of_find_node_by_path("/"); if (root) { rc = of_device_is_compatible(root, compat); of_node_put(root); } return rc; } EXPORT_SYMBOL(of_machine_is_compatible); /** * __of_device_is_available - check if a device is available for use * * @device: Node to check for availability, with locks already held * * Return: True if the status property is absent or set to "okay" or "ok", * false otherwise */ static bool __of_device_is_available(const struct device_node *device) { const char *status; int statlen; if (!device) return false; status = __of_get_property(device, "status", &statlen); if (status == NULL) return true; if (statlen > 0) { if (!strcmp(status, "okay") || !strcmp(status, "ok")) return true; } return false; } /** * of_device_is_available - check if a device is available for use * * @device: Node to check for availability * * Return: True if the status property is absent or set to "okay" or "ok", * false otherwise */ bool of_device_is_available(const struct device_node *device) { unsigned long flags; bool res; raw_spin_lock_irqsave(&devtree_lock, flags); res = __of_device_is_available(device); raw_spin_unlock_irqrestore(&devtree_lock, flags); return res; } EXPORT_SYMBOL(of_device_is_available); /** * __of_device_is_fail - check if a device has status "fail" or "fail-..." * * @device: Node to check status for, with locks already held * * Return: True if the status property is set to "fail" or "fail-..." (for any * error code suffix), false otherwise */ static bool __of_device_is_fail(const struct device_node *device) { const char *status; if (!device) return false; status = __of_get_property(device, "status", NULL); if (status == NULL) return false; return !strcmp(status, "fail") || !strncmp(status, "fail-", 5); } /** * of_device_is_big_endian - check if a device has BE registers * * @device: Node to check for endianness * * Return: True if the device has a "big-endian" property, or if the kernel * was compiled for BE *and* the device has a "native-endian" property. * Returns false otherwise. * * Callers would nominally use ioread32be/iowrite32be if * of_device_is_big_endian() == true, or readl/writel otherwise. */ bool of_device_is_big_endian(const struct device_node *device) { if (of_property_read_bool(device, "big-endian")) return true; if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) && of_property_read_bool(device, "native-endian")) return true; return false; } EXPORT_SYMBOL(of_device_is_big_endian); /** * of_get_parent - Get a node's parent if any * @node: Node to get parent * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_get_parent(const struct device_node *node) { struct device_node *np; unsigned long flags; if (!node) return NULL; raw_spin_lock_irqsave(&devtree_lock, flags); np = of_node_get(node->parent); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_get_parent); /** * of_get_next_parent - Iterate to a node's parent * @node: Node to get parent of * * This is like of_get_parent() except that it drops the * refcount on the passed node, making it suitable for iterating * through a node's parents. * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. 
*/ struct device_node *of_get_next_parent(struct device_node *node) { struct device_node *parent; unsigned long flags; if (!node) return NULL; raw_spin_lock_irqsave(&devtree_lock, flags); parent = of_node_get(node->parent); of_node_put(node); raw_spin_unlock_irqrestore(&devtree_lock, flags); return parent; } EXPORT_SYMBOL(of_get_next_parent); static struct device_node *__of_get_next_child(const struct device_node *node, struct device_node *prev) { struct device_node *next; if (!node) return NULL; next = prev ? prev->sibling : node->child; of_node_get(next); of_node_put(prev); return next; } #define __for_each_child_of_node(parent, child) \ for (child = __of_get_next_child(parent, NULL); child != NULL; \ child = __of_get_next_child(parent, child)) /** * of_get_next_child - Iterate a node childs * @node: parent node * @prev: previous child of the parent node, or NULL to get first * * Return: A node pointer with refcount incremented, use of_node_put() on * it when done. Returns NULL when prev is the last child. Decrements the * refcount of prev. */ struct device_node *of_get_next_child(const struct device_node *node, struct device_node *prev) { struct device_node *next; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); next = __of_get_next_child(node, prev); raw_spin_unlock_irqrestore(&devtree_lock, flags); return next; } EXPORT_SYMBOL(of_get_next_child); /** * of_get_next_available_child - Find the next available child node * @node: parent node * @prev: previous child of the parent node, or NULL to get first * * This function is like of_get_next_child(), except that it * automatically skips any disabled nodes (i.e. status = "disabled"). */ struct device_node *of_get_next_available_child(const struct device_node *node, struct device_node *prev) { struct device_node *next; unsigned long flags; if (!node) return NULL; raw_spin_lock_irqsave(&devtree_lock, flags); next = prev ? prev->sibling : node->child; for (; next; next = next->sibling) { if (!__of_device_is_available(next)) continue; if (of_node_get(next)) break; } of_node_put(prev); raw_spin_unlock_irqrestore(&devtree_lock, flags); return next; } EXPORT_SYMBOL(of_get_next_available_child); /** * of_get_next_cpu_node - Iterate on cpu nodes * @prev: previous child of the /cpus node, or NULL to get first * * Unusable CPUs (those with the status property set to "fail" or "fail-...") * will be skipped. * * Return: A cpu node pointer with refcount incremented, use of_node_put() * on it when done. Returns NULL when prev is the last child. Decrements * the refcount of prev. */ struct device_node *of_get_next_cpu_node(struct device_node *prev) { struct device_node *next = NULL; unsigned long flags; struct device_node *node; if (!prev) node = of_find_node_by_path("/cpus"); raw_spin_lock_irqsave(&devtree_lock, flags); if (prev) next = prev->sibling; else if (node) { next = node->child; of_node_put(node); } for (; next; next = next->sibling) { if (__of_device_is_fail(next)) continue; if (!(of_node_name_eq(next, "cpu") || __of_node_is_type(next, "cpu"))) continue; if (of_node_get(next)) break; } of_node_put(prev); raw_spin_unlock_irqrestore(&devtree_lock, flags); return next; } EXPORT_SYMBOL(of_get_next_cpu_node); /** * of_get_compatible_child - Find compatible child node * @parent: parent node * @compatible: compatible string * * Lookup child node whose compatible property contains the given compatible * string. * * Return: a node pointer with refcount incremented, use of_node_put() on it * when done; or NULL if not found. 
*/ struct device_node *of_get_compatible_child(const struct device_node *parent, const char *compatible) { struct device_node *child; for_each_child_of_node(parent, child) { if (of_device_is_compatible(child, compatible)) break; } return child; } EXPORT_SYMBOL(of_get_compatible_child); /** * of_get_child_by_name - Find the child node by name for a given parent * @node: parent node * @name: child name to look for. * * This function looks for child node for given matching name * * Return: A node pointer if found, with refcount incremented, use * of_node_put() on it when done. * Returns NULL if node is not found. */ struct device_node *of_get_child_by_name(const struct device_node *node, const char *name) { struct device_node *child; for_each_child_of_node(node, child) if (of_node_name_eq(child, name)) break; return child; } EXPORT_SYMBOL(of_get_child_by_name); struct device_node *__of_find_node_by_path(struct device_node *parent, const char *path) { struct device_node *child; int len; len = strcspn(path, "/:"); if (!len) return NULL; __for_each_child_of_node(parent, child) { const char *name = kbasename(child->full_name); if (strncmp(path, name, len) == 0 && (strlen(name) == len)) return child; } return NULL; } struct device_node *__of_find_node_by_full_path(struct device_node *node, const char *path) { const char *separator = strchr(path, ':'); while (node && *path == '/') { struct device_node *tmp = node; path++; /* Increment past '/' delimiter */ node = __of_find_node_by_path(node, path); of_node_put(tmp); path = strchrnul(path, '/'); if (separator && separator < path) break; } return node; } /** * of_find_node_opts_by_path - Find a node matching a full OF path * @path: Either the full path to match, or if the path does not * start with '/', the name of a property of the /aliases * node (an alias). In the case of an alias, the node * matching the alias' value will be returned. * @opts: Address of a pointer into which to store the start of * an options string appended to the end of the path with * a ':' separator. * * Valid paths: * * /foo/bar Full path * * foo Valid alias * * foo/bar Valid alias + relative path * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_node_opts_by_path(const char *path, const char **opts) { struct device_node *np = NULL; struct property *pp; unsigned long flags; const char *separator = strchr(path, ':'); if (opts) *opts = separator ? separator + 1 : NULL; if (strcmp(path, "/") == 0) return of_node_get(of_root); /* The path could begin with an alias */ if (*path != '/') { int len; const char *p = separator; if (!p) p = strchrnul(path, '/'); len = p - path; /* of_aliases must not be NULL */ if (!of_aliases) return NULL; for_each_property_of_node(of_aliases, pp) { if (strlen(pp->name) == len && !strncmp(pp->name, path, len)) { np = of_find_node_by_path(pp->value); break; } } if (!np) return NULL; path = p; } /* Step down the tree matching path components */ raw_spin_lock_irqsave(&devtree_lock, flags); if (!np) np = of_node_get(of_root); np = __of_find_node_by_full_path(np, path); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_opts_by_path); /** * of_find_node_by_name - Find a node by its "name" property * @from: The node to start searching from or NULL; the node * you pass will not be searched, only the next one * will. Typically, you pass what the previous call * returned. of_node_put() will be called on @from. 
* @name: The name string to match against * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_node_by_name(struct device_node *from, const char *name) { struct device_node *np; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); for_each_of_allnodes_from(from, np) if (of_node_name_eq(np, name) && of_node_get(np)) break; of_node_put(from); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_name); /** * of_find_node_by_type - Find a node by its "device_type" property * @from: The node to start searching from, or NULL to start searching * the entire device tree. The node you pass will not be * searched, only the next one will; typically, you pass * what the previous call returned. of_node_put() will be * called on from for you. * @type: The type string to match against * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_node_by_type(struct device_node *from, const char *type) { struct device_node *np; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); for_each_of_allnodes_from(from, np) if (__of_node_is_type(np, type) && of_node_get(np)) break; of_node_put(from); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_type); /** * of_find_compatible_node - Find a node based on type and one of the * tokens in its "compatible" property * @from: The node to start searching from or NULL, the node * you pass will not be searched, only the next one * will; typically, you pass what the previous call * returned. of_node_put() will be called on it * @type: The type string to match "device_type" or NULL to ignore * @compatible: The string to match to one of the tokens in the device * "compatible" list. * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_compatible_node(struct device_node *from, const char *type, const char *compatible) { struct device_node *np; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); for_each_of_allnodes_from(from, np) if (__of_device_is_compatible(np, compatible, type, NULL) && of_node_get(np)) break; of_node_put(from); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_compatible_node); /** * of_find_node_with_property - Find a node which has a property with * the given name. * @from: The node to start searching from or NULL, the node * you pass will not be searched, only the next one * will; typically, you pass what the previous call * returned. of_node_put() will be called on it * @prop_name: The name of the property to look for. * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. 
*/ struct device_node *of_find_node_with_property(struct device_node *from, const char *prop_name) { struct device_node *np; struct property *pp; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); for_each_of_allnodes_from(from, np) { for (pp = np->properties; pp; pp = pp->next) { if (of_prop_cmp(pp->name, prop_name) == 0) { of_node_get(np); goto out; } } } out: of_node_put(from); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_with_property); static const struct of_device_id *__of_match_node(const struct of_device_id *matches, const struct device_node *node) { const struct of_device_id *best_match = NULL; int score, best_score = 0; if (!matches) return NULL; for (; matches->name[0] || matches->type[0] || matches->compatible[0]; matches++) { score = __of_device_is_compatible(node, matches->compatible, matches->type, matches->name); if (score > best_score) { best_match = matches; best_score = score; } } return best_match; } /** * of_match_node - Tell if a device_node has a matching of_match structure * @matches: array of of device match structures to search in * @node: the of device structure to match against * * Low level utility function used by device matching. */ const struct of_device_id *of_match_node(const struct of_device_id *matches, const struct device_node *node) { const struct of_device_id *match; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); match = __of_match_node(matches, node); raw_spin_unlock_irqrestore(&devtree_lock, flags); return match; } EXPORT_SYMBOL(of_match_node); /** * of_find_matching_node_and_match - Find a node based on an of_device_id * match table. * @from: The node to start searching from or NULL, the node * you pass will not be searched, only the next one * will; typically, you pass what the previous call * returned. of_node_put() will be called on it * @matches: array of of device match structures to search in * @match: Updated to point at the matches entry which matched * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_matching_node_and_match(struct device_node *from, const struct of_device_id *matches, const struct of_device_id **match) { struct device_node *np; const struct of_device_id *m; unsigned long flags; if (match) *match = NULL; raw_spin_lock_irqsave(&devtree_lock, flags); for_each_of_allnodes_from(from, np) { m = __of_match_node(matches, np); if (m && of_node_get(np)) { if (match) *match = m; break; } } of_node_put(from); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_matching_node_and_match); /** * of_alias_from_compatible - Lookup appropriate alias for a device node * depending on compatible * @node: pointer to a device tree node * @alias: Pointer to buffer that alias value will be copied into * @len: Length of alias value * * Based on the value of the compatible property, this routine will attempt * to choose an appropriate alias value for a particular device tree node. * It does this by stripping the manufacturer prefix (as delimited by a ',') * from the first entry in the compatible list property. * * Note: The matching on just the "product" side of the compatible is a relic * from I2C and SPI. Please do not add any new user. * * Return: This routine returns 0 on success, <0 on failure. 
*/ int of_alias_from_compatible(const struct device_node *node, char *alias, int len) { const char *compatible, *p; int cplen; compatible = of_get_property(node, "compatible", &cplen); if (!compatible || strlen(compatible) > cplen) return -ENODEV; p = strchr(compatible, ','); strscpy(alias, p ? p + 1 : compatible, len); return 0; } EXPORT_SYMBOL_GPL(of_alias_from_compatible); /** * of_find_node_by_phandle - Find a node given a phandle * @handle: phandle of the node to find * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. */ struct device_node *of_find_node_by_phandle(phandle handle) { struct device_node *np = NULL; unsigned long flags; u32 handle_hash; if (!handle) return NULL; handle_hash = of_phandle_cache_hash(handle); raw_spin_lock_irqsave(&devtree_lock, flags); if (phandle_cache[handle_hash] && handle == phandle_cache[handle_hash]->phandle) np = phandle_cache[handle_hash]; if (!np) { for_each_of_allnodes(np) if (np->phandle == handle && !of_node_check_flag(np, OF_DETACHED)) { phandle_cache[handle_hash] = np; break; } } of_node_get(np); raw_spin_unlock_irqrestore(&devtree_lock, flags); return np; } EXPORT_SYMBOL(of_find_node_by_phandle); void of_print_phandle_args(const char *msg, const struct of_phandle_args *args) { int i; printk("%s %pOF", msg, args->np); for (i = 0; i < args->args_count; i++) { const char delim = i ? ',' : ':'; pr_cont("%c%08x", delim, args->args[i]); } pr_cont("\n"); } int of_phandle_iterator_init(struct of_phandle_iterator *it, const struct device_node *np, const char *list_name, const char *cells_name, int cell_count) { const __be32 *list; int size; memset(it, 0, sizeof(*it)); /* * one of cell_count or cells_name must be provided to determine the * argument length. */ if (cell_count < 0 && !cells_name) return -EINVAL; list = of_get_property(np, list_name, &size); if (!list) return -ENOENT; it->cells_name = cells_name; it->cell_count = cell_count; it->parent = np; it->list_end = list + size / sizeof(*list); it->phandle_end = list; it->cur = list; return 0; } EXPORT_SYMBOL_GPL(of_phandle_iterator_init); int of_phandle_iterator_next(struct of_phandle_iterator *it) { uint32_t count = 0; if (it->node) { of_node_put(it->node); it->node = NULL; } if (!it->cur || it->phandle_end >= it->list_end) return -ENOENT; it->cur = it->phandle_end; /* If phandle is 0, then it is an empty entry with no arguments. */ it->phandle = be32_to_cpup(it->cur++); if (it->phandle) { /* * Find the provider node and parse the #*-cells property to * determine the argument length. 
*/ it->node = of_find_node_by_phandle(it->phandle); if (it->cells_name) { if (!it->node) { pr_err("%pOF: could not find phandle %d\n", it->parent, it->phandle); goto err; } if (of_property_read_u32(it->node, it->cells_name, &count)) { /* * If both cell_count and cells_name is given, * fall back to cell_count in absence * of the cells_name property */ if (it->cell_count >= 0) { count = it->cell_count; } else { pr_err("%pOF: could not get %s for %pOF\n", it->parent, it->cells_name, it->node); goto err; } } } else { count = it->cell_count; } /* * Make sure that the arguments actually fit in the remaining * property data length */ if (it->cur + count > it->list_end) { if (it->cells_name) pr_err("%pOF: %s = %d found %td\n", it->parent, it->cells_name, count, it->list_end - it->cur); else pr_err("%pOF: phandle %s needs %d, found %td\n", it->parent, of_node_full_name(it->node), count, it->list_end - it->cur); goto err; } } it->phandle_end = it->cur + count; it->cur_count = count; return 0; err: if (it->node) { of_node_put(it->node); it->node = NULL; } return -EINVAL; } EXPORT_SYMBOL_GPL(of_phandle_iterator_next); int of_phandle_iterator_args(struct of_phandle_iterator *it, uint32_t *args, int size) { int i, count; count = it->cur_count; if (WARN_ON(size < count)) count = size; for (i = 0; i < count; i++) args[i] = be32_to_cpup(it->cur++); return count; } int __of_parse_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name, int cell_count, int index, struct of_phandle_args *out_args) { struct of_phandle_iterator it; int rc, cur_index = 0; if (index < 0) return -EINVAL; /* Loop over the phandles until all the requested entry is found */ of_for_each_phandle(&it, rc, np, list_name, cells_name, cell_count) { /* * All of the error cases bail out of the loop, so at * this point, the parsing is successful. If the requested * index matches, then fill the out_args structure and return, * or return -ENOENT for an empty entry. */ rc = -ENOENT; if (cur_index == index) { if (!it.phandle) goto err; if (out_args) { int c; c = of_phandle_iterator_args(&it, out_args->args, MAX_PHANDLE_ARGS); out_args->np = it.node; out_args->args_count = c; } else { of_node_put(it.node); } /* Found it! return success */ return 0; } cur_index++; } /* * Unlock node before returning result; will be one of: * -ENOENT : index is for empty phandle * -EINVAL : parsing error on data */ err: of_node_put(it.node); return rc; } EXPORT_SYMBOL(__of_parse_phandle_with_args); /** * of_parse_phandle_with_args_map() - Find a node pointed by phandle in a list and remap it * @np: pointer to a device tree node containing a list * @list_name: property name that contains a list * @stem_name: stem of property names that specify phandles' arguments count * @index: index of a phandle to parse out * @out_args: optional pointer to output arguments structure (will be filled) * * This function is useful to parse lists of phandles and their arguments. * Returns 0 on success and fills out_args, on error returns appropriate errno * value. The difference between this function and of_parse_phandle_with_args() * is that this API remaps a phandle if the node the phandle points to has * a <@stem_name>-map property. * * Caller is responsible to call of_node_put() on the returned out_args->np * pointer. 
* * Example:: * * phandle1: node1 { * #list-cells = <2>; * }; * * phandle2: node2 { * #list-cells = <1>; * }; * * phandle3: node3 { * #list-cells = <1>; * list-map = <0 &phandle2 3>, * <1 &phandle2 2>, * <2 &phandle1 5 1>; * list-map-mask = <0x3>; * }; * * node4 { * list = <&phandle1 1 2 &phandle3 0>; * }; * * To get a device_node of the ``node2`` node you may call this: * of_parse_phandle_with_args(node4, "list", "list", 1, &args); */ int of_parse_phandle_with_args_map(const struct device_node *np, const char *list_name, const char *stem_name, int index, struct of_phandle_args *out_args) { char *cells_name, *map_name = NULL, *mask_name = NULL; char *pass_name = NULL; struct device_node *cur, *new = NULL; const __be32 *map, *mask, *pass; static const __be32 dummy_mask[] = { [0 ... MAX_PHANDLE_ARGS] = ~0 }; static const __be32 dummy_pass[] = { [0 ... MAX_PHANDLE_ARGS] = 0 }; __be32 initial_match_array[MAX_PHANDLE_ARGS]; const __be32 *match_array = initial_match_array; int i, ret, map_len, match; u32 list_size, new_size; if (index < 0) return -EINVAL; cells_name = kasprintf(GFP_KERNEL, "#%s-cells", stem_name); if (!cells_name) return -ENOMEM; ret = -ENOMEM; map_name = kasprintf(GFP_KERNEL, "%s-map", stem_name); if (!map_name) goto free; mask_name = kasprintf(GFP_KERNEL, "%s-map-mask", stem_name); if (!mask_name) goto free; pass_name = kasprintf(GFP_KERNEL, "%s-map-pass-thru", stem_name); if (!pass_name) goto free; ret = __of_parse_phandle_with_args(np, list_name, cells_name, -1, index, out_args); if (ret) goto free; /* Get the #<list>-cells property */ cur = out_args->np; ret = of_property_read_u32(cur, cells_name, &list_size); if (ret < 0) goto put; /* Precalculate the match array - this simplifies match loop */ for (i = 0; i < list_size; i++) initial_match_array[i] = cpu_to_be32(out_args->args[i]); ret = -EINVAL; while (cur) { /* Get the <list>-map property */ map = of_get_property(cur, map_name, &map_len); if (!map) { ret = 0; goto free; } map_len /= sizeof(u32); /* Get the <list>-map-mask property (optional) */ mask = of_get_property(cur, mask_name, NULL); if (!mask) mask = dummy_mask; /* Iterate through <list>-map property */ match = 0; while (map_len > (list_size + 1) && !match) { /* Compare specifiers */ match = 1; for (i = 0; i < list_size; i++, map_len--) match &= !((match_array[i] ^ *map++) & mask[i]); of_node_put(new); new = of_find_node_by_phandle(be32_to_cpup(map)); map++; map_len--; /* Check if not found */ if (!new) goto put; if (!of_device_is_available(new)) match = 0; ret = of_property_read_u32(new, cells_name, &new_size); if (ret) goto put; /* Check for malformed properties */ if (WARN_ON(new_size > MAX_PHANDLE_ARGS)) goto put; if (map_len < new_size) goto put; /* Move forward by new node's #<list>-cells amount */ map += new_size; map_len -= new_size; } if (!match) goto put; /* Get the <list>-map-pass-thru property (optional) */ pass = of_get_property(cur, pass_name, NULL); if (!pass) pass = dummy_pass; /* * Successfully parsed a <list>-map translation; copy new * specifier into the out_args structure, keeping the * bits specified in <list>-map-pass-thru. 
*/ match_array = map - new_size; for (i = 0; i < new_size; i++) { __be32 val = *(map - new_size + i); if (i < list_size) { val &= ~pass[i]; val |= cpu_to_be32(out_args->args[i]) & pass[i]; } out_args->args[i] = be32_to_cpu(val); } out_args->args_count = list_size = new_size; /* Iterate again with new provider */ out_args->np = new; of_node_put(cur); cur = new; new = NULL; } put: of_node_put(cur); of_node_put(new); free: kfree(mask_name); kfree(map_name); kfree(cells_name); kfree(pass_name); return ret; } EXPORT_SYMBOL(of_parse_phandle_with_args_map); /** * of_count_phandle_with_args() - Find the number of phandles references in a property * @np: pointer to a device tree node containing a list * @list_name: property name that contains a list * @cells_name: property name that specifies phandles' arguments count * * Return: The number of phandle + argument tuples within a property. It * is a typical pattern to encode a list of phandle and variable * arguments into a single property. The number of arguments is encoded * by a property in the phandle-target node. For example, a gpios * property would contain a list of GPIO specifies consisting of a * phandle and 1 or more arguments. The number of arguments are * determined by the #gpio-cells property in the node pointed to by the * phandle. */ int of_count_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name) { struct of_phandle_iterator it; int rc, cur_index = 0; /* * If cells_name is NULL we assume a cell count of 0. This makes * counting the phandles trivial as each 32bit word in the list is a * phandle and no arguments are to consider. So we don't iterate through * the list but just use the length to determine the phandle count. */ if (!cells_name) { const __be32 *list; int size; list = of_get_property(np, list_name, &size); if (!list) return -ENOENT; return size / sizeof(*list); } rc = of_phandle_iterator_init(&it, np, list_name, cells_name, -1); if (rc) return rc; while ((rc = of_phandle_iterator_next(&it)) == 0) cur_index += 1; if (rc != -ENOENT) return rc; return cur_index; } EXPORT_SYMBOL(of_count_phandle_with_args); static struct property *__of_remove_property_from_list(struct property **list, struct property *prop) { struct property **next; for (next = list; *next; next = &(*next)->next) { if (*next == prop) { *next = prop->next; prop->next = NULL; return prop; } } return NULL; } /** * __of_add_property - Add a property to a node without lock operations * @np: Caller's Device Node * @prop: Property to add */ int __of_add_property(struct device_node *np, struct property *prop) { int rc = 0; unsigned long flags; struct property **next; raw_spin_lock_irqsave(&devtree_lock, flags); __of_remove_property_from_list(&np->deadprops, prop); prop->next = NULL; next = &np->properties; while (*next) { if (strcmp(prop->name, (*next)->name) == 0) { /* duplicate ! 
don't insert it */ rc = -EEXIST; goto out_unlock; } next = &(*next)->next; } *next = prop; out_unlock: raw_spin_unlock_irqrestore(&devtree_lock, flags); if (rc) return rc; __of_add_property_sysfs(np, prop); return 0; } /** * of_add_property - Add a property to a node * @np: Caller's Device Node * @prop: Property to add */ int of_add_property(struct device_node *np, struct property *prop) { int rc; mutex_lock(&of_mutex); rc = __of_add_property(np, prop); mutex_unlock(&of_mutex); if (!rc) of_property_notify(OF_RECONFIG_ADD_PROPERTY, np, prop, NULL); return rc; } EXPORT_SYMBOL_GPL(of_add_property); int __of_remove_property(struct device_node *np, struct property *prop) { unsigned long flags; int rc = -ENODEV; raw_spin_lock_irqsave(&devtree_lock, flags); if (__of_remove_property_from_list(&np->properties, prop)) { /* Found the property, add it to deadprops list */ prop->next = np->deadprops; np->deadprops = prop; rc = 0; } raw_spin_unlock_irqrestore(&devtree_lock, flags); if (rc) return rc; __of_remove_property_sysfs(np, prop); return 0; } /** * of_remove_property - Remove a property from a node. * @np: Caller's Device Node * @prop: Property to remove * * Note that we don't actually remove it, since we have given out * who-knows-how-many pointers to the data using get-property. * Instead we just move the property to the "dead properties" * list, so it won't be found any more. */ int of_remove_property(struct device_node *np, struct property *prop) { int rc; if (!prop) return -ENODEV; mutex_lock(&of_mutex); rc = __of_remove_property(np, prop); mutex_unlock(&of_mutex); if (!rc) of_property_notify(OF_RECONFIG_REMOVE_PROPERTY, np, prop, NULL); return rc; } EXPORT_SYMBOL_GPL(of_remove_property); int __of_update_property(struct device_node *np, struct property *newprop, struct property **oldpropp) { struct property **next, *oldprop; unsigned long flags; raw_spin_lock_irqsave(&devtree_lock, flags); __of_remove_property_from_list(&np->deadprops, newprop); for (next = &np->properties; *next; next = &(*next)->next) { if (of_prop_cmp((*next)->name, newprop->name) == 0) break; } *oldpropp = oldprop = *next; if (oldprop) { /* replace the node */ newprop->next = oldprop->next; *next = newprop; oldprop->next = np->deadprops; np->deadprops = oldprop; } else { /* new node */ newprop->next = NULL; *next = newprop; } raw_spin_unlock_irqrestore(&devtree_lock, flags); __of_update_property_sysfs(np, newprop, oldprop); return 0; } /* * of_update_property - Update a property in a node, if the property does * not exist, add it. * * Note that we don't actually remove it, since we have given out * who-knows-how-many pointers to the data using get-property. 
* Instead we just move the property to the "dead properties" list, * and add the new property to the property list */ int of_update_property(struct device_node *np, struct property *newprop) { struct property *oldprop; int rc; if (!newprop->name) return -EINVAL; mutex_lock(&of_mutex); rc = __of_update_property(np, newprop, &oldprop); mutex_unlock(&of_mutex); if (!rc) of_property_notify(OF_RECONFIG_UPDATE_PROPERTY, np, newprop, oldprop); return rc; } static void of_alias_add(struct alias_prop *ap, struct device_node *np, int id, const char *stem, int stem_len) { ap->np = np; ap->id = id; strscpy(ap->stem, stem, stem_len + 1); list_add_tail(&ap->link, &aliases_lookup); pr_debug("adding DT alias:%s: stem=%s id=%i node=%pOF\n", ap->alias, ap->stem, ap->id, np); } /** * of_alias_scan - Scan all properties of the 'aliases' node * @dt_alloc: An allocator that provides a virtual address to memory * for storing the resulting tree * * The function scans all the properties of the 'aliases' node and populates * the global lookup table with the properties. It returns the * number of alias properties found, or an error code in case of failure. */ void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align)) { struct property *pp; of_aliases = of_find_node_by_path("/aliases"); of_chosen = of_find_node_by_path("/chosen"); if (of_chosen == NULL) of_chosen = of_find_node_by_path("/chosen@0"); if (of_chosen) { /* linux,stdout-path and /aliases/stdout are for legacy compatibility */ const char *name = NULL; if (of_property_read_string(of_chosen, "stdout-path", &name)) of_property_read_string(of_chosen, "linux,stdout-path", &name); if (IS_ENABLED(CONFIG_PPC) && !name) of_property_read_string(of_aliases, "stdout", &name); if (name) of_stdout = of_find_node_opts_by_path(name, &of_stdout_options); if (of_stdout) of_stdout->fwnode.flags |= FWNODE_FLAG_BEST_EFFORT; } if (!of_aliases) return; for_each_property_of_node(of_aliases, pp) { const char *start = pp->name; const char *end = start + strlen(start); struct device_node *np; struct alias_prop *ap; int id, len; /* Skip those we do not want to proceed */ if (!strcmp(pp->name, "name") || !strcmp(pp->name, "phandle") || !strcmp(pp->name, "linux,phandle")) continue; np = of_find_node_by_path(pp->value); if (!np) continue; /* walk the alias backwards to extract the id and work out * the 'stem' string */ while (isdigit(*(end-1)) && end > start) end--; len = end - start; if (kstrtoint(end, 10, &id) < 0) continue; /* Allocate an alias_prop with enough space for the stem */ ap = dt_alloc(sizeof(*ap) + len + 1, __alignof__(*ap)); if (!ap) continue; memset(ap, 0, sizeof(*ap) + len + 1); ap->alias = start; of_alias_add(ap, np, id, start, len); } } /** * of_alias_get_id - Get alias id for the given device_node * @np: Pointer to the given device_node * @stem: Alias stem of the given device_node * * The function travels the lookup table to get the alias id for the given * device_node and alias stem. * * Return: The alias id if found. */ int of_alias_get_id(struct device_node *np, const char *stem) { struct alias_prop *app; int id = -ENODEV; mutex_lock(&of_mutex); list_for_each_entry(app, &aliases_lookup, link) { if (strcmp(app->stem, stem) != 0) continue; if (np == app->np) { id = app->id; break; } } mutex_unlock(&of_mutex); return id; } EXPORT_SYMBOL_GPL(of_alias_get_id); /** * of_alias_get_highest_id - Get highest alias id for the given stem * @stem: Alias stem to be examined * * The function travels the lookup table to get the highest alias id for the * given alias stem. 
It returns the alias id if found. */ int of_alias_get_highest_id(const char *stem) { struct alias_prop *app; int id = -ENODEV; mutex_lock(&of_mutex); list_for_each_entry(app, &aliases_lookup, link) { if (strcmp(app->stem, stem) != 0) continue; if (app->id > id) id = app->id; } mutex_unlock(&of_mutex); return id; } EXPORT_SYMBOL_GPL(of_alias_get_highest_id); /** * of_console_check() - Test and setup console for DT setup * @dn: Pointer to device node * @name: Name to use for preferred console without index. ex. "ttyS" * @index: Index to use for preferred console. * * Check if the given device node matches the stdout-path property in the * /chosen node. If it does then register it as the preferred console. * * Return: TRUE if console successfully setup. Otherwise return FALSE. */ bool of_console_check(struct device_node *dn, char *name, int index) { if (!dn || dn != of_stdout || console_set_on_cmdline) return false; /* * XXX: cast `options' to char pointer to suppress complication * warnings: printk, UART and console drivers expect char pointer. */ return !add_preferred_console(name, index, (char *)of_stdout_options); } EXPORT_SYMBOL_GPL(of_console_check); /** * of_find_next_cache_node - Find a node's subsidiary cache * @np: node of type "cpu" or "cache" * * Return: A node pointer with refcount incremented, use * of_node_put() on it when done. Caller should hold a reference * to np. */ struct device_node *of_find_next_cache_node(const struct device_node *np) { struct device_node *child, *cache_node; cache_node = of_parse_phandle(np, "l2-cache", 0); if (!cache_node) cache_node = of_parse_phandle(np, "next-level-cache", 0); if (cache_node) return cache_node; /* OF on pmac has nodes instead of properties named "l2-cache" * beneath CPU nodes. */ if (IS_ENABLED(CONFIG_PPC_PMAC) && of_node_is_type(np, "cpu")) for_each_child_of_node(np, child) if (of_node_is_type(child, "cache")) return child; return NULL; } /** * of_find_last_cache_level - Find the level at which the last cache is * present for the given logical cpu * * @cpu: cpu number(logical index) for which the last cache level is needed * * Return: The level at which the last cache is present. It is exactly * same as the total number of cache levels for the given logical cpu. */ int of_find_last_cache_level(unsigned int cpu) { u32 cache_level = 0; struct device_node *prev = NULL, *np = of_cpu_device_node_get(cpu); while (np) { of_node_put(prev); prev = np; np = of_find_next_cache_node(np); } of_property_read_u32(prev, "cache-level", &cache_level); of_node_put(prev); return cache_level; } /** * of_map_id - Translate an ID through a downstream mapping. * @np: root complex device node. * @id: device ID to map. * @map_name: property name of the map to use. * @map_mask_name: optional property name of the mask to use. * @target: optional pointer to a target device node. * @id_out: optional pointer to receive the translated ID. * * Given a device ID, look up the appropriate implementation-defined * platform ID and/or the target device which receives transactions on that * ID, as per the "iommu-map" and "msi-map" bindings. Either of @target or * @id_out may be NULL if only the other is required. If @target points to * a non-NULL device node pointer, only entries targeting that node will be * matched; if it points to a NULL value, it will receive the device node of * the first matching target phandle, with a reference held. * * Return: 0 on success or a standard error code on failure. 
*/ int of_map_id(struct device_node *np, u32 id, const char *map_name, const char *map_mask_name, struct device_node **target, u32 *id_out) { u32 map_mask, masked_id; int map_len; const __be32 *map = NULL; if (!np || !map_name || (!target && !id_out)) return -EINVAL; map = of_get_property(np, map_name, &map_len); if (!map) { if (target) return -ENODEV; /* Otherwise, no map implies no translation */ *id_out = id; return 0; } if (!map_len || map_len % (4 * sizeof(*map))) { pr_err("%pOF: Error: Bad %s length: %d\n", np, map_name, map_len); return -EINVAL; } /* The default is to select all bits. */ map_mask = 0xffffffff; /* * Can be overridden by "{iommu,msi}-map-mask" property. * If of_property_read_u32() fails, the default is used. */ if (map_mask_name) of_property_read_u32(np, map_mask_name, &map_mask); masked_id = map_mask & id; for ( ; map_len > 0; map_len -= 4 * sizeof(*map), map += 4) { struct device_node *phandle_node; u32 id_base = be32_to_cpup(map + 0); u32 phandle = be32_to_cpup(map + 1); u32 out_base = be32_to_cpup(map + 2); u32 id_len = be32_to_cpup(map + 3); if (id_base & ~map_mask) { pr_err("%pOF: Invalid %s translation - %s-mask (0x%x) ignores id-base (0x%x)\n", np, map_name, map_name, map_mask, id_base); return -EFAULT; } if (masked_id < id_base || masked_id >= id_base + id_len) continue; phandle_node = of_find_node_by_phandle(phandle); if (!phandle_node) return -ENODEV; if (target) { if (*target) of_node_put(phandle_node); else *target = phandle_node; if (*target != phandle_node) continue; } if (id_out) *id_out = masked_id - id_base + out_base; pr_debug("%pOF: %s, using mask %08x, id-base: %08x, out-base: %08x, length: %08x, id: %08x -> %08x\n", np, map_name, map_mask, id_base, out_base, id_len, id, masked_id - id_base + out_base); return 0; } pr_info("%pOF: no %s translation for id 0x%x on %pOF\n", np, map_name, id, target && *target ? *target : NULL); /* Bypasses translation */ if (id_out) *id_out = id; return 0; } EXPORT_SYMBOL_GPL(of_map_id);
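A minimal usage sketch for the lookup helpers above, assuming only the public <linux/of.h> API. The bus/child layout, the "clocks" property name and the example_probe_children() function are invented for illustration; the point is the reference-counting contract: every helper that returns a device_node hands back an incremented refcount that the caller must drop with of_node_put() (the child iterator drops the previous child's reference itself as it advances).

#include <linux/of.h>
#include <linux/printk.h>

static int example_probe_children(struct device_node *bus)
{
	struct device_node *child;
	struct of_phandle_args clkspec;
	int ret;

	for_each_available_child_of_node(bus, child) {
		/* Parse entry 0 of a hypothetical "clocks" phandle list. */
		ret = of_parse_phandle_with_args(child, "clocks",
						 "#clock-cells", 0, &clkspec);
		if (ret)
			continue;

		pr_info("%pOF: clock provider %pOF with %d argument cells\n",
			child, clkspec.np, clkspec.args_count);

		/* The provider node was returned with a reference held. */
		of_node_put(clkspec.np);
	}

	return 0;
}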
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _IPV6_H #define _IPV6_H #include <uapi/linux/ipv6.h> #define ipv6_optlen(p) (((p)->hdrlen+1) << 3) #define ipv6_authlen(p) (((p)->hdrlen+2) << 2) /* * This structure contains configuration options per IPv6 link. */ struct ipv6_devconf { __s32 forwarding; __s32 hop_limit; __s32 mtu6; __s32 accept_ra; __s32 accept_redirects; __s32 autoconf; __s32 dad_transmits; __s32 rtr_solicits; __s32 rtr_solicit_interval; __s32 rtr_solicit_max_interval; __s32 rtr_solicit_delay; __s32 force_mld_version; __s32 mldv1_unsolicited_report_interval; __s32 mldv2_unsolicited_report_interval; __s32 use_tempaddr; __s32 temp_valid_lft; __s32 temp_prefered_lft; __s32 regen_max_retry; __s32 max_desync_factor; __s32 max_addresses; __s32 accept_ra_defrtr; __u32 ra_defrtr_metric; __s32 accept_ra_min_hop_limit; __s32 accept_ra_min_lft; __s32 accept_ra_pinfo; __s32 ignore_routes_with_linkdown; #ifdef CONFIG_IPV6_ROUTER_PREF __s32 accept_ra_rtr_pref; __s32 rtr_probe_interval; #ifdef CONFIG_IPV6_ROUTE_INFO __s32 accept_ra_rt_info_min_plen; __s32 accept_ra_rt_info_max_plen; #endif #endif __s32 proxy_ndp; __s32 accept_source_route; __s32 accept_ra_from_local; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD __s32 optimistic_dad; __s32 use_optimistic; #endif #ifdef CONFIG_IPV6_MROUTE atomic_t mc_forwarding; #endif __s32 disable_ipv6; __s32 drop_unicast_in_l2_multicast; __s32 accept_dad; __s32 force_tllao; __s32 ndisc_notify; __s32 suppress_frag_ndisc; __s32 accept_ra_mtu; __s32 drop_unsolicited_na; __s32 accept_untracked_na; struct ipv6_stable_secret { bool initialized; struct in6_addr secret; } stable_secret; __s32 use_oif_addrs_only; __s32 keep_addr_on_down; __s32 seg6_enabled; #ifdef CONFIG_IPV6_SEG6_HMAC __s32 seg6_require_hmac; #endif __u32 enhanced_dad; __u32 addr_gen_mode; __s32 disable_policy; __s32 ndisc_tclass; __s32 rpl_seg_enabled; __u32 ioam6_id; __u32 ioam6_id_wide; __u8 ioam6_enabled; __u8 ndisc_evict_nocarrier; __u8 ra_honor_pio_life; struct ctl_table_header *sysctl_header; }; struct ipv6_params { __s32 disable_ipv6; __s32 autoconf; }; extern struct ipv6_params ipv6_defaults; #include <linux/tcp.h> #include <linux/udp.h> #include <net/inet_sock.h> static inline struct
ipv6hdr *ipv6_hdr(const struct sk_buff *skb) { return (struct ipv6hdr *)skb_network_header(skb); } static inline struct ipv6hdr *inner_ipv6_hdr(const struct sk_buff *skb) { return (struct ipv6hdr *)skb_inner_network_header(skb); } static inline struct ipv6hdr *ipipv6_hdr(const struct sk_buff *skb) { return (struct ipv6hdr *)skb_transport_header(skb); } static inline unsigned int ipv6_transport_len(const struct sk_buff *skb) { return ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr) - skb_network_header_len(skb); } /* This structure contains results of exthdrs parsing as offsets from skb->nh. */ struct inet6_skb_parm { int iif; __be16 ra; __u16 dst0; __u16 srcrt; __u16 dst1; __u16 lastopt; __u16 nhoff; __u16 flags; #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) __u16 dsthao; #endif __u16 frag_max_size; __u16 srhoff; #define IP6SKB_XFRM_TRANSFORMED 1 #define IP6SKB_FORWARDED 2 #define IP6SKB_REROUTED 4 #define IP6SKB_ROUTERALERT 8 #define IP6SKB_FRAGMENTED 16 #define IP6SKB_HOPBYHOP 32 #define IP6SKB_L3SLAVE 64 #define IP6SKB_JUMBOGRAM 128 #define IP6SKB_SEG6 256 #define IP6SKB_FAKEJUMBO 512 #define IP6SKB_MULTIPATH 1024 }; #if defined(CONFIG_NET_L3_MASTER_DEV) static inline bool ipv6_l3mdev_skb(__u16 flags) { return flags & IP6SKB_L3SLAVE; } #else static inline bool ipv6_l3mdev_skb(__u16 flags) { return false; } #endif #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) #define IP6CBMTU(skb) ((struct ip6_mtuinfo *)((skb)->cb)) static inline int inet6_iif(const struct sk_buff *skb) { bool l3_slave = ipv6_l3mdev_skb(IP6CB(skb)->flags); return l3_slave ? skb->skb_iif : IP6CB(skb)->iif; } static inline bool inet6_is_jumbogram(const struct sk_buff *skb) { return !!(IP6CB(skb)->flags & IP6SKB_JUMBOGRAM); } /* can not be used in TCP layer after tcp_v6_fill_cb */ static inline int inet6_sdif(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) return IP6CB(skb)->iif; #endif return 0; } struct tcp6_request_sock { struct tcp_request_sock tcp6rsk_tcp; }; struct ipv6_mc_socklist; struct ipv6_ac_socklist; struct ipv6_fl_socklist; struct inet6_cork { struct ipv6_txoptions *opt; u8 hop_limit; u8 tclass; }; /* struct ipv6_pinfo - ipv6 private area */ struct ipv6_pinfo { struct in6_addr saddr; struct in6_pktinfo sticky_pktinfo; const struct in6_addr *daddr_cache; #ifdef CONFIG_IPV6_SUBTREES const struct in6_addr *saddr_cache; #endif __be32 flow_label; __u32 frag_size; s16 hop_limit; u8 mcast_hops; int ucast_oif; int mcast_oif; /* pktoption flags */ union { struct { __u16 srcrt:1, osrcrt:1, rxinfo:1, rxoinfo:1, rxhlim:1, rxohlim:1, hopopts:1, ohopopts:1, dstopts:1, odstopts:1, rxflow:1, rxtclass:1, rxpmtu:1, rxorigdstaddr:1, recvfragsize:1; /* 1 bits hole */ } bits; __u16 all; } rxopt; /* sockopt flags */ __u8 srcprefs; /* 001: prefer temporary address * 010: prefer public address * 100: prefer care-of address */ __u8 pmtudisc; __u8 min_hopcount; __u8 tclass; __be32 rcv_flowinfo; __u32 dst_cookie; struct ipv6_mc_socklist __rcu *ipv6_mc_list; struct ipv6_ac_socklist *ipv6_ac_list; struct ipv6_fl_socklist __rcu *ipv6_fl_list; struct ipv6_txoptions __rcu *opt; struct sk_buff *pktoptions; struct sk_buff *rxpmtu; struct inet6_cork cork; }; /* We currently use available bits from inet_sk(sk)->inet_flags, * this could change in the future. 
*/ #define inet6_test_bit(nr, sk) \ test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet6_set_bit(nr, sk) \ set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet6_clear_bit(nr, sk) \ clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet6_assign_bit(nr, sk, val) \ assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val) /* WARNING: don't change the layout of the members in {raw,udp,tcp}6_sock! */ struct raw6_sock { /* inet_sock has to be the first member of raw6_sock */ struct inet_sock inet; __u32 checksum; /* perform checksum */ __u32 offset; /* checksum offset */ struct icmp6_filter filter; __u32 ip6mr_table; struct ipv6_pinfo inet6; }; struct udp6_sock { struct udp_sock udp; struct ipv6_pinfo inet6; }; struct tcp6_sock { struct tcp_sock tcp; struct ipv6_pinfo inet6; }; extern int inet6_sk_rebuild_header(struct sock *sk); struct tcp6_timewait_sock { struct tcp_timewait_sock tcp6tw_tcp; }; #if IS_ENABLED(CONFIG_IPV6) bool ipv6_mod_enabled(void); static inline struct ipv6_pinfo *inet6_sk(const struct sock *__sk) { return sk_fullsock(__sk) ? inet_sk(__sk)->pinet6 : NULL; } #define raw6_sk(ptr) container_of_const(ptr, struct raw6_sock, inet.sk) #define ipv6_only_sock(sk) (sk->sk_ipv6only) #define ipv6_sk_rxinfo(sk) ((sk)->sk_family == PF_INET6 && \ inet6_sk(sk)->rxopt.bits.rxinfo) static inline const struct in6_addr *inet6_rcv_saddr(const struct sock *sk) { if (sk->sk_family == AF_INET6) return &sk->sk_v6_rcv_saddr; return NULL; } static inline int inet_v6_ipv6only(const struct sock *sk) { /* ipv6only field is at same position for timewait and other sockets */ return ipv6_only_sock(sk); } #else #define ipv6_only_sock(sk) 0 #define ipv6_sk_rxinfo(sk) 0 static inline bool ipv6_mod_enabled(void) { return false; } static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk) { return NULL; } static inline struct raw6_sock *raw6_sk(const struct sock *sk) { return NULL; } #define inet6_rcv_saddr(__sk) NULL #define inet_v6_ipv6only(__sk) 0 #endif /* IS_ENABLED(CONFIG_IPV6) */ #endif /* _IPV6_H */
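A hedged sketch of how the accessors above combine on the receive path. The example_ipv6_rx_note() helper and its policy are invented for illustration; it assumes the IPv6 input path has already set the network header and filled the IP6CB() control block via extension-header parsing.

#include <linux/ipv6.h>
#include <linux/skbuff.h>

static bool example_ipv6_rx_note(const struct sk_buff *skb)
{
	const struct ipv6hdr *ip6h = ipv6_hdr(skb);

	/* Transport payload length implied by the IPv6 header. */
	unsigned int tlen = ipv6_transport_len(skb);

	/* Per-packet results of exthdr parsing live in the skb control block. */
	if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED)
		return false;

	return ip6h->hop_limit > 1 && tlen > 0;
}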
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/bpf-cgroup.h> #include <linux/cgroup.h> #include <linux/rcupdate.h> #include <linux/random.h> #include <linux/smp.h> #include <linux/topology.h> #include <linux/ktime.h> #include <linux/sched.h> #include <linux/uidgid.h> #include <linux/filter.h> #include <linux/ctype.h> #include <linux/jiffies.h> #include <linux/pid_namespace.h> #include <linux/poison.h> #include <linux/proc_ns.h> #include <linux/sched/task.h> #include <linux/security.h> #include <linux/btf_ids.h> #include <linux/bpf_mem_alloc.h> #include <linux/kasan.h> #include "../../lib/kstrtox.h" /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments * * Different map implementations will rely on rcu in map methods * lookup/update/delete, therefore eBPF programs must run under rcu lock * if program is allowed to access maps, so check rcu_read_lock_held() or * rcu_read_lock_trace_held() in all three functions.
*/ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); return (unsigned long) map->ops->map_lookup_elem(map, key); } const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, void *, value, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); return map->ops->map_update_elem(map, key, value, flags); } const struct bpf_func_proto bpf_map_update_elem_proto = { .func = bpf_map_update_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, .arg3_type = ARG_PTR_TO_MAP_VALUE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && !rcu_read_lock_bh_held()); return map->ops->map_delete_elem(map, key); } const struct bpf_func_proto bpf_map_delete_elem_proto = { .func = bpf_map_delete_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags) { return map->ops->map_push_elem(map, value, flags); } const struct bpf_func_proto bpf_map_push_elem_proto = { .func = bpf_map_push_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_VALUE, .arg3_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) { return map->ops->map_pop_elem(map, value); } const struct bpf_func_proto bpf_map_pop_elem_proto = { .func = bpf_map_pop_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, }; BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) { return map->ops->map_peek_elem(map, value); } const struct bpf_func_proto bpf_map_peek_elem_proto = { .func = bpf_map_peek_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, }; BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) { WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); } const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto = { .func = bpf_map_lookup_percpu_elem, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, .arg3_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_get_prandom_u32_proto = { .func = bpf_user_rnd_u32, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_smp_processor_id) { return smp_processor_id(); } const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .func = bpf_get_smp_processor_id, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_numa_node_id) { return numa_node_id(); } const struct bpf_func_proto bpf_get_numa_node_id_proto = { .func = bpf_get_numa_node_id, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to 
clock monotonic */ return ktime_get_mono_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_ns_proto = { .func = bpf_ktime_get_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_boot_ns) { /* NMI safe access to clock boottime */ return ktime_get_boot_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { .func = bpf_ktime_get_boot_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_coarse_ns) { return ktime_get_coarse_ns(); } const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = { .func = bpf_ktime_get_coarse_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_ktime_get_tai_ns) { /* NMI safe access to clock tai */ return ktime_get_tai_fast_ns(); } const struct bpf_func_proto bpf_ktime_get_tai_ns_proto = { .func = bpf_ktime_get_tai_ns, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; if (unlikely(!task)) return -EINVAL; return (u64) task->tgid << 32 | task->pid; } const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { .func = bpf_get_current_pid_tgid, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_current_uid_gid) { struct task_struct *task = current; kuid_t uid; kgid_t gid; if (unlikely(!task)) return -EINVAL; current_uid_gid(&uid, &gid); return (u64) from_kgid(&init_user_ns, gid) << 32 | from_kuid(&init_user_ns, uid); } const struct bpf_func_proto bpf_get_current_uid_gid_proto = { .func = bpf_get_current_uid_gid, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) { struct task_struct *task = current; if (unlikely(!task)) goto err_clear; /* Verifier guarantees that size > 0 */ strscpy_pad(buf, task->comm, size); return 0; err_clear: memset(buf, 0, size); return -EINVAL; } const struct bpf_func_proto bpf_get_current_comm_proto = { .func = bpf_get_current_comm, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE, }; #if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK) static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) { arch_spinlock_t *l = (void *)lock; union { __u32 val; arch_spinlock_t lock; } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED }; compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0"); BUILD_BUG_ON(sizeof(*l) != sizeof(__u32)); BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32)); preempt_disable(); arch_spin_lock(l); } static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) { arch_spinlock_t *l = (void *)lock; arch_spin_unlock(l); preempt_enable(); } #else static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) { atomic_t *l = (void *)lock; BUILD_BUG_ON(sizeof(*l) != sizeof(*lock)); do { atomic_cond_read_relaxed(l, !VAL); } while (atomic_xchg(l, 1)); } static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) { atomic_t *l = (void *)lock; atomic_set_release(l, 0); } #endif static DEFINE_PER_CPU(unsigned long, irqsave_flags); static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock) { unsigned long flags; local_irq_save(flags); __bpf_spin_lock(lock); __this_cpu_write(irqsave_flags, flags); } notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) { __bpf_spin_lock_irqsave(lock); return 0; } const struct bpf_func_proto bpf_spin_lock_proto = { .func = bpf_spin_lock, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_SPIN_LOCK, .arg1_btf_id = BPF_PTR_POISON, }; static inline void 
__bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock) { unsigned long flags; flags = __this_cpu_read(irqsave_flags); __bpf_spin_unlock(lock); local_irq_restore(flags); } notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) { __bpf_spin_unlock_irqrestore(lock); return 0; } const struct bpf_func_proto bpf_spin_unlock_proto = { .func = bpf_spin_unlock, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_SPIN_LOCK, .arg1_btf_id = BPF_PTR_POISON, }; void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src) { struct bpf_spin_lock *lock; if (lock_src) lock = src + map->record->spin_lock_off; else lock = dst + map->record->spin_lock_off; preempt_disable(); __bpf_spin_lock_irqsave(lock); copy_map_value(map, dst, src); __bpf_spin_unlock_irqrestore(lock); preempt_enable(); } BPF_CALL_0(bpf_jiffies64) { return get_jiffies_64(); } const struct bpf_func_proto bpf_jiffies64_proto = { .func = bpf_jiffies64, .gpl_only = false, .ret_type = RET_INTEGER, }; #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { struct cgroup *cgrp; u64 cgrp_id; rcu_read_lock(); cgrp = task_dfl_cgroup(current); cgrp_id = cgroup_id(cgrp); rcu_read_unlock(); return cgrp_id; } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .func = bpf_get_current_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level) { struct cgroup *cgrp; struct cgroup *ancestor; u64 cgrp_id; rcu_read_lock(); cgrp = task_dfl_cgroup(current); ancestor = cgroup_ancestor(cgrp, ancestor_level); cgrp_id = ancestor ? cgroup_id(ancestor) : 0; rcu_read_unlock(); return cgrp_id; } const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { .func = bpf_get_current_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; #endif /* CONFIG_CGROUPS */ #define BPF_STRTOX_BASE_MASK 0x1F static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags, unsigned long long *res, bool *is_negative) { unsigned int base = flags & BPF_STRTOX_BASE_MASK; const char *cur_buf = buf; size_t cur_len = buf_len; unsigned int consumed; size_t val_len; char str[64]; if (!buf || !buf_len || !res || !is_negative) return -EINVAL; if (base != 0 && base != 8 && base != 10 && base != 16) return -EINVAL; if (flags & ~BPF_STRTOX_BASE_MASK) return -EINVAL; while (cur_buf < buf + buf_len && isspace(*cur_buf)) ++cur_buf; *is_negative = (cur_buf < buf + buf_len && *cur_buf == '-'); if (*is_negative) ++cur_buf; consumed = cur_buf - buf; cur_len -= consumed; if (!cur_len) return -EINVAL; cur_len = min(cur_len, sizeof(str) - 1); memcpy(str, cur_buf, cur_len); str[cur_len] = '\0'; cur_buf = str; cur_buf = _parse_integer_fixup_radix(cur_buf, &base); val_len = _parse_integer(cur_buf, base, res); if (val_len & KSTRTOX_OVERFLOW) return -ERANGE; if (val_len == 0) return -EINVAL; cur_buf += val_len; consumed += cur_buf - str; return consumed; } static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags, long long *res) { unsigned long long _res; bool is_negative; int err; err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); if (err < 0) return err; if (is_negative) { if ((long long)-_res > 0) return -ERANGE; *res = -_res; } else { if ((long long)_res < 0) return -ERANGE; *res = _res; } return err; } BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags, long *, res) { long long _res; int err; err = __bpf_strtoll(buf, buf_len, flags, &_res); if (err < 0) return err; if (_res != 
(long)_res) return -ERANGE; *res = _res; return err; } const struct bpf_func_proto bpf_strtol_proto = { .func = bpf_strtol, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, }; BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags, unsigned long *, res) { unsigned long long _res; bool is_negative; int err; err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); if (err < 0) return err; if (is_negative) return -EINVAL; if (_res != (unsigned long)_res) return -ERANGE; *res = _res; return err; } const struct bpf_func_proto bpf_strtoul_proto = { .func = bpf_strtoul, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, }; BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2) { return strncmp(s1, s2, s1_sz); } static const struct bpf_func_proto bpf_strncmp_proto = { .func = bpf_strncmp, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_PTR_TO_CONST_STR, }; BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino, struct bpf_pidns_info *, nsdata, u32, size) { struct task_struct *task = current; struct pid_namespace *pidns; int err = -EINVAL; if (unlikely(size != sizeof(struct bpf_pidns_info))) goto clear; if (unlikely((u64)(dev_t)dev != dev)) goto clear; if (unlikely(!task)) goto clear; pidns = task_active_pid_ns(task); if (unlikely(!pidns)) { err = -ENOENT; goto clear; } if (!ns_match(&pidns->ns, (dev_t)dev, ino)) goto clear; nsdata->pid = task_pid_nr_ns(task, pidns); nsdata->tgid = task_tgid_nr_ns(task, pidns); return 0; clear: memset((void *)nsdata, 0, (size_t) size); return err; } const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { .func = bpf_get_ns_current_pid_tgid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { .func = bpf_get_raw_cpu_id, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags, void *, data, u64, size) { if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; return bpf_event_output(map, flags, data, size, NULL, 0, NULL); } const struct bpf_func_proto bpf_event_output_data_proto = { .func = bpf_event_output_data, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size, const void __user *, user_ptr) { int ret = copy_from_user(dst, user_ptr, size); if (unlikely(ret)) { memset(dst, 0, size); ret = -EFAULT; } return ret; } const struct bpf_func_proto bpf_copy_from_user_proto = { .func = bpf_copy_from_user, .gpl_only = false, .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size, const void __user *, user_ptr, struct task_struct *, tsk, u64, flags) { int ret; /* flags is not used yet */ if (unlikely(flags)) return -EINVAL; if (unlikely(!size)) return 0; ret = access_process_vm(tsk, 
(unsigned long)user_ptr, dst, size, 0); if (ret == size) return 0; memset(dst, 0, size); /* Return -EFAULT for partial read */ return ret < 0 ? ret : -EFAULT; } const struct bpf_func_proto bpf_copy_from_user_task_proto = { .func = bpf_copy_from_user_task, .gpl_only = true, .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_BTF_ID, .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg5_type = ARG_ANYTHING }; BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) { if (cpu >= nr_cpu_ids) return (unsigned long)NULL; return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu); } const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .func = bpf_per_cpu_ptr, .gpl_only = false, .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, .arg2_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) { return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr); } const struct bpf_func_proto bpf_this_cpu_ptr_proto = { .func = bpf_this_cpu_ptr, .gpl_only = false, .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, }; static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, size_t bufsz) { void __user *user_ptr = (__force void __user *)unsafe_ptr; buf[0] = 0; switch (fmt_ptype) { case 's': #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE if ((unsigned long)unsafe_ptr < TASK_SIZE) return strncpy_from_user_nofault(buf, user_ptr, bufsz); fallthrough; #endif case 'k': return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); case 'u': return strncpy_from_user_nofault(buf, user_ptr, bufsz); } return -EINVAL; } /* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary * arguments representation. */ #define MAX_BPRINTF_BIN_ARGS 512 /* Support executing three nested bprintf helper calls on a given CPU */ #define MAX_BPRINTF_NEST_LEVEL 3 struct bpf_bprintf_buffers { char bin_args[MAX_BPRINTF_BIN_ARGS]; char buf[MAX_BPRINTF_BUF]; }; static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs); static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); static int try_get_buffers(struct bpf_bprintf_buffers **bufs) { int nest_level; preempt_disable(); nest_level = this_cpu_inc_return(bpf_bprintf_nest_level); if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) { this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); return -EBUSY; } *bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]); return 0; } void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) { if (!data->bin_args && !data->buf) return; if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) return; this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); } /* * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers * * Returns a negative value if fmt is an invalid format string or 0 otherwise. * * This can be used in two ways: * - Format string verification only: when data->get_bin_args is false * - Arguments preparation: in addition to the above verification, it writes in * data->bin_args a binary representation of arguments usable by bstr_printf * where pointers from BPF have been sanitized. * * In argument preparation mode, if 0 is returned, safe temporary buffers are * allocated and bpf_bprintf_cleanup should be called to free them after use. 
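 *
 * A minimal usage sketch (this mirrors what bpf_snprintf() below does; the
 * local variable names here are illustrative only):
 *
 *	struct bpf_bprintf_data data = { .get_bin_args = true };
 *	int err;
 *
 *	err = bpf_bprintf_prepare(fmt, fmt_size, raw_args, num_args, &data);
 *	if (err < 0)
 *		return err;
 *	err = bstr_printf(buf, buf_size, fmt, data.bin_args);
 *	bpf_bprintf_cleanup(&data);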
*/ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data) { bool get_buffers = (data->get_bin_args && num_args) || data->get_buf; char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end; struct bpf_bprintf_buffers *buffers = NULL; size_t sizeof_cur_arg, sizeof_cur_ip; int err, i, num_spec = 0; u64 cur_arg; char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX"; fmt_end = strnchr(fmt, fmt_size, 0); if (!fmt_end) return -EINVAL; fmt_size = fmt_end - fmt; if (get_buffers && try_get_buffers(&buffers)) return -EBUSY; if (data->get_bin_args) { if (num_args) tmp_buf = buffers->bin_args; tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS; data->bin_args = (u32 *)tmp_buf; } if (data->get_buf) data->buf = buffers->buf; for (i = 0; i < fmt_size; i++) { if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { err = -EINVAL; goto out; } if (fmt[i] != '%') continue; if (fmt[i + 1] == '%') { i++; continue; } if (num_spec >= num_args) { err = -EINVAL; goto out; } /* The string is zero-terminated so if fmt[i] != 0, we can * always access fmt[i + 1], in the worst case it will be a 0 */ i++; /* skip optional "[0 +-][num]" width formatting field */ while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || fmt[i] == ' ') i++; if (fmt[i] >= '1' && fmt[i] <= '9') { i++; while (fmt[i] >= '0' && fmt[i] <= '9') i++; } if (fmt[i] == 'p') { sizeof_cur_arg = sizeof(long); if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') && fmt[i + 2] == 's') { fmt_ptype = fmt[i + 1]; i += 2; goto fmt_str; } if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' || fmt[i + 1] == 'x' || fmt[i + 1] == 's' || fmt[i + 1] == 'S') { /* just kernel pointers */ if (tmp_buf) cur_arg = raw_args[num_spec]; i++; goto nocopy_fmt; } if (fmt[i + 1] == 'B') { if (tmp_buf) { err = snprintf(tmp_buf, (tmp_buf_end - tmp_buf), "%pB", (void *)(long)raw_args[num_spec]); tmp_buf += (err + 1); } i++; num_spec++; continue; } /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') || (fmt[i + 2] != '4' && fmt[i + 2] != '6')) { err = -EINVAL; goto out; } i += 2; if (!tmp_buf) goto nocopy_fmt; sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16; if (tmp_buf_end - tmp_buf < sizeof_cur_ip) { err = -ENOSPC; goto out; } unsafe_ptr = (char *)(long)raw_args[num_spec]; err = copy_from_kernel_nofault(cur_ip, unsafe_ptr, sizeof_cur_ip); if (err < 0) memset(cur_ip, 0, sizeof_cur_ip); /* hack: bstr_printf expects IP addresses to be * pre-formatted as strings, ironically, the easiest way * to do that is to call snprintf. 
*/ ip_spec[2] = fmt[i - 1]; ip_spec[3] = fmt[i]; err = snprintf(tmp_buf, tmp_buf_end - tmp_buf, ip_spec, &cur_ip); tmp_buf += err + 1; num_spec++; continue; } else if (fmt[i] == 's') { fmt_ptype = fmt[i]; fmt_str: if (fmt[i + 1] != 0 && !isspace(fmt[i + 1]) && !ispunct(fmt[i + 1])) { err = -EINVAL; goto out; } if (!tmp_buf) goto nocopy_fmt; if (tmp_buf_end == tmp_buf) { err = -ENOSPC; goto out; } unsafe_ptr = (char *)(long)raw_args[num_spec]; err = bpf_trace_copy_string(tmp_buf, unsafe_ptr, fmt_ptype, tmp_buf_end - tmp_buf); if (err < 0) { tmp_buf[0] = '\0'; err = 1; } tmp_buf += err; num_spec++; continue; } else if (fmt[i] == 'c') { if (!tmp_buf) goto nocopy_fmt; if (tmp_buf_end == tmp_buf) { err = -ENOSPC; goto out; } *tmp_buf = raw_args[num_spec]; tmp_buf++; num_spec++; continue; } sizeof_cur_arg = sizeof(int); if (fmt[i] == 'l') { sizeof_cur_arg = sizeof(long); i++; } if (fmt[i] == 'l') { sizeof_cur_arg = sizeof(long long); i++; } if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x' && fmt[i] != 'X') { err = -EINVAL; goto out; } if (tmp_buf) cur_arg = raw_args[num_spec]; nocopy_fmt: if (tmp_buf) { tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32)); if (tmp_buf_end - tmp_buf < sizeof_cur_arg) { err = -ENOSPC; goto out; } if (sizeof_cur_arg == 8) { *(u32 *)tmp_buf = *(u32 *)&cur_arg; *(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1); } else { *(u32 *)tmp_buf = (u32)(long)cur_arg; } tmp_buf += sizeof_cur_arg; } num_spec++; } err = 0; out: if (err) bpf_bprintf_cleanup(data); return err; } BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, const void *, args, u32, data_len) { struct bpf_bprintf_data data = { .get_bin_args = true, }; int err, num_args; if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 || (data_len && !args)) return -EINVAL; num_args = data_len / 8; /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we * can safely give an unbounded size. */ err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data); if (err < 0) return err; err = bstr_printf(str, str_size, fmt, data.bin_args); bpf_bprintf_cleanup(&data); return err + 1; } const struct bpf_func_proto bpf_snprintf_proto = { .func = bpf_snprintf, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM_OR_NULL, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_CONST_STR, .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; /* BPF map elements can contain 'struct bpf_timer'. * Such map owns all of its BPF timers. * 'struct bpf_timer' is allocated as part of map element allocation * and it's zero initialized. * That space is used to keep 'struct bpf_timer_kern'. * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and * remembers 'struct bpf_map *' pointer it's part of. * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn. * bpf_timer_start() arms the timer. * If user space reference to a map goes to zero at this point * ops->map_release_uref callback is responsible for cancelling the timers, * freeing their memory, and decrementing prog's refcnts. * bpf_timer_cancel() cancels the timer and decrements prog's refcnt. * Inner maps can contain bpf timers as well. ops->map_release_uref is * freeing the timers when inner map is replaced or deleted by user space. 
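 *
 * A rough sketch of the BPF-program side, assuming a hash map whose value
 * embeds a struct bpf_timer (the map, field and callback names are
 * illustrative only):
 *
 *	struct map_elem {
 *		int counter;
 *		struct bpf_timer timer;
 *	};
 *
 *	static int timer_cb(void *map, int *key, struct map_elem *val)
 *	{
 *		// do work; optionally re-arm with bpf_timer_start()
 *		return 0;
 *	}
 *
 *	// in the program, with val pointing to a looked-up map element:
 *	bpf_timer_init(&val->timer, &hmap, CLOCK_MONOTONIC);
 *	bpf_timer_set_callback(&val->timer, timer_cb);
 *	bpf_timer_start(&val->timer, 1000000, 0);	// nsecs, flags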
*/ struct bpf_hrtimer { struct hrtimer timer; struct bpf_map *map; struct bpf_prog *prog; void __rcu *callback_fn; void *value; }; /* the actual struct hidden inside uapi struct bpf_timer */ struct bpf_timer_kern { struct bpf_hrtimer *timer; /* bpf_spin_lock is used here instead of spinlock_t to make * sure that it always fits into space reserved by struct bpf_timer * regardless of LOCKDEP and spinlock debug flags. */ struct bpf_spin_lock lock; } __attribute__((aligned(8))); static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) { struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer); struct bpf_map *map = t->map; void *value = t->value; bpf_callback_t callback_fn; void *key; u32 idx; BTF_TYPE_EMIT(struct bpf_timer); callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held()); if (!callback_fn) goto out; /* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and * cannot be preempted by another bpf_timer_cb() on the same cpu. * Remember the timer this callback is servicing to prevent * deadlock if callback_fn() calls bpf_timer_cancel() or * bpf_map_delete_elem() on the same timer. */ this_cpu_write(hrtimer_running, t); if (map->map_type == BPF_MAP_TYPE_ARRAY) { struct bpf_array *array = container_of(map, struct bpf_array, map); /* compute the key */ idx = ((char *)value - array->value) / array->elem_size; key = &idx; } else { /* hash or lru */ key = value - round_up(map->key_size, 8); } callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); /* The verifier checked that return value is zero. */ this_cpu_write(hrtimer_running, NULL); out: return HRTIMER_NORESTART; } BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map, u64, flags) { clockid_t clockid = flags & (MAX_CLOCKS - 1); struct bpf_hrtimer *t; int ret = 0; BUILD_BUG_ON(MAX_CLOCKS != 16); BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer)); BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer)); if (in_nmi()) return -EOPNOTSUPP; if (flags >= MAX_CLOCKS || /* similar to timerfd except _ALARM variants are not supported */ (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME && clockid != CLOCK_BOOTTIME)) return -EINVAL; __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; if (t) { ret = -EBUSY; goto out; } /* allocate hrtimer via map_kmalloc to use memcg accounting */ t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node); if (!t) { ret = -ENOMEM; goto out; } t->value = (void *)timer - map->record->timer_off; t->map = map; t->prog = NULL; rcu_assign_pointer(t->callback_fn, NULL); hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); t->timer.function = bpf_timer_cb; WRITE_ONCE(timer->timer, t); /* Guarantee the order between timer->timer and map->usercnt. So * when there are concurrent uref release and bpf timer init, either * bpf_timer_cancel_and_free() called by uref release reads a no-NULL * timer or atomic64_read() below returns a zero usercnt. */ smp_mb(); if (!atomic64_read(&map->usercnt)) { /* maps with timers must be either held by user space * or pinned in bpffs. 
*/ WRITE_ONCE(timer->timer, NULL); kfree(t); ret = -EPERM; } out: __bpf_spin_unlock_irqrestore(&timer->lock); return ret; } static const struct bpf_func_proto bpf_timer_init_proto = { .func = bpf_timer_init, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn, struct bpf_prog_aux *, aux) { struct bpf_prog *prev, *prog = aux->prog; struct bpf_hrtimer *t; int ret = 0; if (in_nmi()) return -EOPNOTSUPP; __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; if (!t) { ret = -EINVAL; goto out; } if (!atomic64_read(&t->map->usercnt)) { /* maps with timers must be either held by user space * or pinned in bpffs. Otherwise timer might still be * running even when bpf prog is detached and user space * is gone, since map_release_uref won't ever be called. */ ret = -EPERM; goto out; } prev = t->prog; if (prev != prog) { /* Bump prog refcnt once. Every bpf_timer_set_callback() * can pick different callback_fn-s within the same prog. */ prog = bpf_prog_inc_not_zero(prog); if (IS_ERR(prog)) { ret = PTR_ERR(prog); goto out; } if (prev) /* Drop prev prog refcnt when swapping with new prog */ bpf_prog_put(prev); t->prog = prog; } rcu_assign_pointer(t->callback_fn, callback_fn); out: __bpf_spin_unlock_irqrestore(&timer->lock); return ret; } static const struct bpf_func_proto bpf_timer_set_callback_proto = { .func = bpf_timer_set_callback, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, .arg2_type = ARG_PTR_TO_FUNC, }; BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags) { struct bpf_hrtimer *t; int ret = 0; enum hrtimer_mode mode; if (in_nmi()) return -EOPNOTSUPP; if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN)) return -EINVAL; __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; if (!t || !t->prog) { ret = -EINVAL; goto out; } if (flags & BPF_F_TIMER_ABS) mode = HRTIMER_MODE_ABS_SOFT; else mode = HRTIMER_MODE_REL_SOFT; if (flags & BPF_F_TIMER_CPU_PIN) mode |= HRTIMER_MODE_PINNED; hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); out: __bpf_spin_unlock_irqrestore(&timer->lock); return ret; } static const struct bpf_func_proto bpf_timer_start_proto = { .func = bpf_timer_start, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static void drop_prog_refcnt(struct bpf_hrtimer *t) { struct bpf_prog *prog = t->prog; if (prog) { bpf_prog_put(prog); t->prog = NULL; rcu_assign_pointer(t->callback_fn, NULL); } } BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer) { struct bpf_hrtimer *t; int ret = 0; if (in_nmi()) return -EOPNOTSUPP; __bpf_spin_lock_irqsave(&timer->lock); t = timer->timer; if (!t) { ret = -EINVAL; goto out; } if (this_cpu_read(hrtimer_running) == t) { /* If bpf callback_fn is trying to bpf_timer_cancel() * its own timer the hrtimer_cancel() will deadlock * since it waits for callback_fn to finish */ ret = -EDEADLK; goto out; } drop_prog_refcnt(t); out: __bpf_spin_unlock_irqrestore(&timer->lock); /* Cancel the timer and wait for associated callback to finish * if it was running. 
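 * Note that the cancel happens only after timer->lock has been dropped:
 * hrtimer_cancel() may wait for bpf_timer_cb() to finish, and the callback
 * can itself take timer->lock via other bpf_timer_*() helpers, so waiting
 * while holding the lock could deadlock.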
*/ ret = ret ?: hrtimer_cancel(&t->timer); return ret; } static const struct bpf_func_proto bpf_timer_cancel_proto = { .func = bpf_timer_cancel, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_TIMER, }; /* This function is called by map_delete/update_elem for individual element and * by ops->map_release_uref when the user space reference to a map reaches zero. */ void bpf_timer_cancel_and_free(void *val) { struct bpf_timer_kern *timer = val; struct bpf_hrtimer *t; /* Performance optimization: read timer->timer without lock first. */ if (!READ_ONCE(timer->timer)) return; __bpf_spin_lock_irqsave(&timer->lock); /* re-read it under lock */ t = timer->timer; if (!t) goto out; drop_prog_refcnt(t); /* The subsequent bpf_timer_start/cancel() helpers won't be able to use * this timer, since it won't be initialized. */ WRITE_ONCE(timer->timer, NULL); out: __bpf_spin_unlock_irqrestore(&timer->lock); if (!t) return; /* Cancel the timer and wait for callback to complete if it was running. * If hrtimer_cancel() can be safely called it's safe to call kfree(t) * right after for both preallocated and non-preallocated maps. * The timer->timer = NULL was already done and no code path can * see address 't' anymore. * * Check that bpf_map_delete/update_elem() wasn't called from timer * callback_fn. In such case don't call hrtimer_cancel() (since it will * deadlock) and don't call hrtimer_try_to_cancel() (since it will just * return -1). Though callback_fn is still running on this cpu it's * safe to do kfree(t) because bpf_timer_cb() read everything it needed * from 't'. The bpf subprog callback_fn won't be able to access 't', * since timer->timer = NULL was already done. The timer will be * effectively cancelled because bpf_timer_cb() will return * HRTIMER_NORESTART. */ if (this_cpu_read(hrtimer_running) != t) hrtimer_cancel(&t->timer); kfree(t); } BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr) { unsigned long *kptr = map_value; return xchg(kptr, (unsigned long)ptr); } /* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg() * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to * denote type that verifier will determine. */ static const struct bpf_func_proto bpf_kptr_xchg_proto = { .func = bpf_kptr_xchg, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .ret_btf_id = BPF_PTR_POISON, .arg1_type = ARG_PTR_TO_KPTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE, .arg2_btf_id = BPF_PTR_POISON, }; /* Since the upper 8 bits of dynptr->size is reserved, the * maximum supported size is 2^24 - 1. */ #define DYNPTR_MAX_SIZE ((1UL << 24) - 1) #define DYNPTR_TYPE_SHIFT 28 #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) static bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) { ptr->size |= DYNPTR_RDONLY_BIT; } static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type) { ptr->size |= type << DYNPTR_TYPE_SHIFT; } static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr) { return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT; } u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } static void bpf_dynptr_set_size(struct bpf_dynptr_kern *ptr, u32 new_size) { u32 metadata = ptr->size & ~DYNPTR_SIZE_MASK; ptr->size = new_size | metadata; } int bpf_dynptr_check_size(u32 size) { return size > DYNPTR_MAX_SIZE ? 
-E2BIG : 0; } void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size) { ptr->data = data; ptr->offset = offset; ptr->size = size; bpf_dynptr_set_type(ptr, type); } void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) { memset(ptr, 0, sizeof(*ptr)); } static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) { u32 size = __bpf_dynptr_size(ptr); if (len > size || offset > size - len) return -E2BIG; return 0; } BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) { int err; BTF_TYPE_EMIT(struct bpf_dynptr); err = bpf_dynptr_check_size(size); if (err) goto error; /* flags is currently unsupported */ if (flags) { err = -EINVAL; goto error; } bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_LOCAL, 0, size); return 0; error: bpf_dynptr_set_null(ptr); return err; } static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .func = bpf_dynptr_from_mem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, }; BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, u32, offset, u64, flags) { enum bpf_dynptr_type type; int err; if (!src->data || flags) return -EINVAL; err = bpf_dynptr_check_off_len(src, offset, len); if (err) return err; type = bpf_dynptr_get_type(src); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: /* Source and destination may possibly overlap, hence use memmove to * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr * pointing to overlapping PTR_TO_MAP_VALUE regions. */ memmove(dst, src->data + src->offset + offset, len); return 0; case BPF_DYNPTR_TYPE_SKB: return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_XDP: return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; } } static const struct bpf_func_proto bpf_dynptr_read_proto = { .func = bpf_dynptr_read, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len, u64, flags) { enum bpf_dynptr_type type; int err; if (!dst->data || __bpf_dynptr_is_rdonly(dst)) return -EINVAL; err = bpf_dynptr_check_off_len(dst, offset, len); if (err) return err; type = bpf_dynptr_get_type(dst); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: if (flags) return -EINVAL; /* Source and destination may possibly overlap, hence use memmove to * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr * pointing to overlapping PTR_TO_MAP_VALUE regions. 
*/ memmove(dst->data + dst->offset + offset, src, len); return 0; case BPF_DYNPTR_TYPE_SKB: return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len, flags); case BPF_DYNPTR_TYPE_XDP: if (flags) return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; } } static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) { enum bpf_dynptr_type type; int err; if (!ptr->data) return 0; err = bpf_dynptr_check_off_len(ptr, offset, len); if (err) return 0; if (__bpf_dynptr_is_rdonly(ptr)) return 0; type = bpf_dynptr_get_type(ptr); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: return (unsigned long)(ptr->data + ptr->offset + offset); case BPF_DYNPTR_TYPE_SKB: case BPF_DYNPTR_TYPE_XDP: /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */ return 0; default: WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type); return 0; } } static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_get_current_task_btf_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_task_pt_regs_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_map_lookup_elem: return &bpf_map_lookup_elem_proto; case BPF_FUNC_map_update_elem: return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; case BPF_FUNC_map_push_elem: return &bpf_map_push_elem_proto; case BPF_FUNC_map_pop_elem: return &bpf_map_pop_elem_proto; case BPF_FUNC_map_peek_elem: return &bpf_map_peek_elem_proto; case BPF_FUNC_map_lookup_percpu_elem: return &bpf_map_lookup_percpu_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_raw_smp_processor_id_proto; case BPF_FUNC_get_numa_node_id: return &bpf_get_numa_node_id_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_ktime_get_tai_ns: return &bpf_ktime_get_tai_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: return &bpf_ringbuf_reserve_proto; case BPF_FUNC_ringbuf_submit: return &bpf_ringbuf_submit_proto; case BPF_FUNC_ringbuf_discard: return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; case BPF_FUNC_strncmp: return &bpf_strncmp_proto; case BPF_FUNC_strtol: return &bpf_strtol_proto; case BPF_FUNC_strtoul: 
return &bpf_strtoul_proto; default: break; } if (!bpf_capable()) return NULL; switch (func_id) { case BPF_FUNC_spin_lock: return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; case BPF_FUNC_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; case BPF_FUNC_timer_init: return &bpf_timer_init_proto; case BPF_FUNC_timer_set_callback: return &bpf_timer_set_callback_proto; case BPF_FUNC_timer_start: return &bpf_timer_start_proto; case BPF_FUNC_timer_cancel: return &bpf_timer_cancel_proto; case BPF_FUNC_kptr_xchg: return &bpf_kptr_xchg_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: return &bpf_loop_proto; case BPF_FUNC_user_ringbuf_drain: return &bpf_user_ringbuf_drain_proto; case BPF_FUNC_ringbuf_reserve_dynptr: return &bpf_ringbuf_reserve_dynptr_proto; case BPF_FUNC_ringbuf_submit_dynptr: return &bpf_ringbuf_submit_dynptr_proto; case BPF_FUNC_ringbuf_discard_dynptr: return &bpf_ringbuf_discard_dynptr_proto; case BPF_FUNC_dynptr_from_mem: return &bpf_dynptr_from_mem_proto; case BPF_FUNC_dynptr_read: return &bpf_dynptr_read_proto; case BPF_FUNC_dynptr_write: return &bpf_dynptr_write_proto; case BPF_FUNC_dynptr_data: return &bpf_dynptr_data_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_cgrp_storage_get: return &bpf_cgrp_storage_get_proto; case BPF_FUNC_cgrp_storage_delete: return &bpf_cgrp_storage_delete_proto; case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_get_current_ancestor_cgroup_id: return &bpf_get_current_ancestor_cgroup_id_proto; #endif default: break; } if (!perfmon_capable()) return NULL; switch (func_id) { case BPF_FUNC_trace_printk: return bpf_get_trace_printk_proto(); case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: return &bpf_get_current_task_btf_proto; case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_str_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: return &bpf_snprintf_proto; case BPF_FUNC_task_pt_regs: return &bpf_task_pt_regs_proto; case BPF_FUNC_trace_vprintk: return bpf_get_trace_vprintk_proto(); default: return NULL; } } void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock) { struct list_head *head = list_head, *orig_head = list_head; BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head)); BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head)); /* Do the actual list draining outside the lock to not hold the lock for * too long, and also prevent deadlocks if tracing programs end up * executing on entry/exit of functions called inside the critical * section, and end up doing map ops that call bpf_list_head_free for * the same map value again. 
*/ __bpf_spin_lock_irqsave(spin_lock); if (!head->next || list_empty(head)) goto unlock; head = head->next; unlock: INIT_LIST_HEAD(orig_head); __bpf_spin_unlock_irqrestore(spin_lock); while (head != orig_head) { void *obj = head; obj -= field->graph_root.node_offset; head = head->next; /* The contained type can also have resources, including a * bpf_list_head which needs to be freed. */ migrate_disable(); __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); migrate_enable(); } } /* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are * 'rb_node *', so field name of rb_node within containing struct is not * needed. * * Since bpf_rb_tree's node type has a corresponding struct btf_field with * graph_root.node_offset, it's not necessary to know field name * or type of node struct */ #define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \ for (pos = rb_first_postorder(root); \ pos && ({ n = rb_next_postorder(pos); 1; }); \ pos = n) void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock) { struct rb_root_cached orig_root, *root = rb_root; struct rb_node *pos, *n; void *obj; BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root)); BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root)); __bpf_spin_lock_irqsave(spin_lock); orig_root = *root; *root = RB_ROOT_CACHED; __bpf_spin_unlock_irqrestore(spin_lock); bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) { obj = pos; obj -= field->graph_root.node_offset; migrate_disable(); __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); migrate_enable(); } } __bpf_kfunc_start_defs(); __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) { struct btf_struct_meta *meta = meta__ign; u64 size = local_type_id__k; void *p; p = bpf_mem_alloc(&bpf_global_ma, size); if (!p) return NULL; if (meta) bpf_obj_init(meta->record, p); return p; } __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) { u64 size = local_type_id__k; /* The verifier has ensured that meta__ign must be NULL */ return bpf_mem_alloc(&bpf_global_percpu_ma, size); } /* Must be called under migrate_disable(), as required by bpf_mem_free */ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu) { struct bpf_mem_alloc *ma; if (rec && rec->refcount_off >= 0 && !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) { /* Object is refcounted and refcount_dec didn't result in 0 * refcount. Return without freeing the object */ return; } if (rec) bpf_obj_free_fields(rec, p); if (percpu) ma = &bpf_global_percpu_ma; else ma = &bpf_global_ma; bpf_mem_free_rcu(ma, p); } __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) { struct btf_struct_meta *meta = meta__ign; void *p = p__alloc; __bpf_obj_drop_impl(p, meta ? 
meta->record : NULL, false); } __bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign) { /* The verifier has ensured that meta__ign must be NULL */ bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc); } __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign) { struct btf_struct_meta *meta = meta__ign; struct bpf_refcount *ref; /* Could just cast directly to refcount_t *, but need some code using * bpf_refcount type so that it is emitted in vmlinux BTF */ ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off); if (!refcount_inc_not_zero((refcount_t *)ref)) return NULL; /* Verifier strips KF_RET_NULL if input is owned ref, see is_kfunc_ret_null * in verifier.c */ return (void *)p__refcounted_kptr; } static int __bpf_list_add(struct bpf_list_node_kern *node, struct bpf_list_head *head, bool tail, struct btf_record *rec, u64 off) { struct list_head *n = &node->list_head, *h = (void *)head; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here */ if (unlikely(!h->next)) INIT_LIST_HEAD(h); /* node->owner != NULL implies !list_empty(n), no need to separately * check the latter */ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } tail ? list_add_tail(n, h) : list_add(n, h); WRITE_ONCE(node->owner, head); return 0; } __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head, struct bpf_list_node *node, void *meta__ign, u64 off) { struct bpf_list_node_kern *n = (void *)node; struct btf_struct_meta *meta = meta__ign; return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, struct bpf_list_node *node, void *meta__ign, u64 off) { struct bpf_list_node_kern *n = (void *)node; struct btf_struct_meta *meta = meta__ign; return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off); } static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail) { struct list_head *n, *h = (void *)head; struct bpf_list_node_kern *node; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here */ if (unlikely(!h->next)) INIT_LIST_HEAD(h); if (list_empty(h)) return NULL; n = tail ? h->prev : h->next; node = container_of(n, struct bpf_list_node_kern, list_head); if (WARN_ON_ONCE(READ_ONCE(node->owner) != head)) return NULL; list_del_init(n); WRITE_ONCE(node->owner, NULL); return (struct bpf_list_node *)n; } __bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) { return __bpf_list_del(head, false); } __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) { return __bpf_list_del(head, true); } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct bpf_rb_node *node) { struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; struct rb_root_cached *r = (struct rb_root_cached *)root; struct rb_node *n = &node_internal->rb_node; /* node_internal->owner != root implies either RB_EMPTY_NODE(n) or * n is owned by some other tree. 
No need to check RB_EMPTY_NODE(n) */ if (READ_ONCE(node_internal->owner) != root) return NULL; rb_erase_cached(n, r); RB_CLEAR_NODE(n); WRITE_ONCE(node_internal->owner, NULL); return (struct bpf_rb_node *)n; } /* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF * program */ static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node_kern *node, void *less, struct btf_record *rec, u64 off) { struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node; struct rb_node *parent = NULL, *n = &node->rb_node; bpf_callback_t cb = (bpf_callback_t)less; bool leftmost = true; /* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately * check the latter */ if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ __bpf_obj_drop_impl((void *)n - off, rec, false); return -EINVAL; } while (*link) { parent = *link; if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(n, parent, link); rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost); WRITE_ONCE(node->owner, root); return 0; } __bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), void *meta__ign, u64 off) { struct btf_struct_meta *meta = meta__ign; struct bpf_rb_node_kern *n = (void *)node; return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off); } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) { struct rb_root_cached *r = (struct rb_root_cached *)root; return (struct bpf_rb_node *)rb_first_cached(r); } /** * bpf_task_acquire - Acquire a reference to a task. A task acquired by this * kfunc which is not stored in a map as a kptr, must be released by calling * bpf_task_release(). * @p: The task on which a reference is being acquired. */ __bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p) { if (refcount_inc_not_zero(&p->rcu_users)) return p; return NULL; } /** * bpf_task_release - Release the reference acquired on a task. * @p: The task on which a reference is being released. */ __bpf_kfunc void bpf_task_release(struct task_struct *p) { put_task_struct_rcu_user(p); } __bpf_kfunc void bpf_task_release_dtor(void *p) { put_task_struct_rcu_user(p); } CFI_NOSEAL(bpf_task_release_dtor); #ifdef CONFIG_CGROUPS /** * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by * this kfunc which is not stored in a map as a kptr, must be released by * calling bpf_cgroup_release(). * @cgrp: The cgroup on which a reference is being acquired. */ __bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp) { return cgroup_tryget(cgrp) ? cgrp : NULL; } /** * bpf_cgroup_release - Release the reference acquired on a cgroup. * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to * not be freed until the current grace period has ended, even if its refcount * drops to 0. * @cgrp: The cgroup on which a reference is being released. */ __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp) { cgroup_put(cgrp); } __bpf_kfunc void bpf_cgroup_release_dtor(void *cgrp) { cgroup_put(cgrp); } CFI_NOSEAL(bpf_cgroup_release_dtor); /** * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor * array. A cgroup returned by this kfunc which is not subsequently stored in a * map, must be released by calling bpf_cgroup_release(). 
* @cgrp: The cgroup for which we're performing a lookup. * @level: The level of ancestor to look up. */ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) { struct cgroup *ancestor; if (level > cgrp->level || level < 0) return NULL; /* cgrp's refcnt could be 0 here, but ancestors can still be accessed */ ancestor = cgrp->ancestors[level]; if (!cgroup_tryget(ancestor)) return NULL; return ancestor; } /** * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this * kfunc which is not subsequently stored in a map, must be released by calling * bpf_cgroup_release(). * @cgid: cgroup id. */ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) { struct cgroup *cgrp; cgrp = cgroup_get_from_id(cgid); if (IS_ERR(cgrp)) return NULL; return cgrp; } /** * bpf_task_under_cgroup - wrap task_under_cgroup_hierarchy() as a kfunc, test * task's membership of cgroup ancestry. * @task: the task to be tested * @ancestor: possible ancestor of @task's cgroup * * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. * It follows all the same rules as cgroup_is_descendant, and only applies * to the default hierarchy. */ __bpf_kfunc long bpf_task_under_cgroup(struct task_struct *task, struct cgroup *ancestor) { long ret; rcu_read_lock(); ret = task_under_cgroup_hierarchy(task, ancestor); rcu_read_unlock(); return ret; } /** * bpf_task_get_cgroup1 - Acquires the associated cgroup of a task within a * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its * hierarchy ID. * @task: The target task * @hierarchy_id: The ID of a cgroup1 hierarchy * * On success, the cgroup is returned. On failure, NULL is returned. */ __bpf_kfunc struct cgroup * bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id) { struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id); if (IS_ERR(cgrp)) return NULL; return cgrp; } #endif /* CONFIG_CGROUPS */ /** * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up * in the root pid namespace idr. If a task is returned, it must either be * stored in a map, or released with bpf_task_release(). * @pid: The pid of the task being looked up. */ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) { struct task_struct *p; rcu_read_lock(); p = find_task_by_pid_ns(pid, &init_pid_ns); if (p) p = bpf_task_acquire(p); rcu_read_unlock(); return p; } /** * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. * @ptr: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr * @buffer__opt: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * * If buffer__opt is NULL, the call will fail if buffer_opt was needed. * * If the intention is to write to the data slice, please use * bpf_dynptr_slice_rdwr. * * The user must check that the returned pointer is not null before using it. * * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice * does not change the underlying packet data pointers, so a call to * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in * the bpf program.
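 *
 * A short usage sketch (assuming an skb/tc context; the names are
 * illustrative and mirror the bpf_dynptr_slice_rdwr() example below):
 *
 *	struct iphdr buffer;
 *	struct iphdr *iph;
 *
 *	iph = bpf_dynptr_slice(&dynptr, 0, &buffer, sizeof(buffer));
 *	if (!iph)
 *		return TC_ACT_SHOT;
 *	// iph may point into the packet or into buffer; treat it as read-only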
* * Return: NULL if the call failed (eg invalid dynptr), pointer to a read-only * data slice (can be either direct pointer to the data or a pointer to the user * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset, void *buffer__opt, u32 buffer__szk) { enum bpf_dynptr_type type; u32 len = buffer__szk; int err; if (!ptr->data) return NULL; err = bpf_dynptr_check_off_len(ptr, offset, len); if (err) return NULL; type = bpf_dynptr_get_type(ptr); switch (type) { case BPF_DYNPTR_TYPE_LOCAL: case BPF_DYNPTR_TYPE_RINGBUF: return ptr->data + ptr->offset + offset; case BPF_DYNPTR_TYPE_SKB: if (buffer__opt) return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt); else return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len); case BPF_DYNPTR_TYPE_XDP: { void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len); if (!IS_ERR_OR_NULL(xdp_ptr)) return xdp_ptr; if (!buffer__opt) return NULL; bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false); return buffer__opt; } default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; } } /** * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data. * @ptr: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr * @buffer__opt: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * * If buffer__opt is NULL, the call will fail if buffer_opt was needed. * * The returned pointer is writable and may point to either directly the dynptr * data at the requested offset or to the buffer if unable to obtain a direct * data pointer to (example: the requested slice is to the paged area of an skb * packet). In the case where the returned pointer is to the buffer, the user * is responsible for persisting writes through calling bpf_dynptr_write(). This * usually looks something like this pattern: * * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer)); * if (!eth) * return TC_ACT_SHOT; * * // mutate eth header // * * if (eth == buffer) * bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0); * * Please note that, as in the example above, the user must check that the * returned pointer is not null before using it. * * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr * does not change the underlying packet data pointers, so a call to * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in * the bpf program. * * Return: NULL if the call failed (eg invalid dynptr), pointer to a * data slice (can be either direct pointer to the data or a pointer to the user * provided buffer, with its contents containing the data, if unable to obtain * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset, void *buffer__opt, u32 buffer__szk) { if (!ptr->data || __bpf_dynptr_is_rdonly(ptr)) return NULL; /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice. * * For skb-type dynptrs, it is safe to write into the returned pointer * if the bpf program allows skb data writes. 
There are two possibilities * that may occur when calling bpf_dynptr_slice_rdwr: * * 1) The requested slice is in the head of the skb. In this case, the * returned pointer is directly to skb data, and if the skb is cloned, the * verifier will have uncloned it (see bpf_unclone_prologue()) already. * The pointer can be directly written into. * * 2) Some portion of the requested slice is in the paged buffer area. * In this case, the requested data will be copied out into the buffer * and the returned pointer will be a pointer to the buffer. The skb * will not be pulled. To persist the write, the user will need to call * bpf_dynptr_write(), which will pull the skb and commit the write. * * Similarly for xdp programs, if the requested slice is not across xdp * fragments, then a direct pointer will be returned, otherwise the data * will be copied out into the buffer and the user will need to call * bpf_dynptr_write() to commit changes. */ return bpf_dynptr_slice(ptr, offset, buffer__opt, buffer__szk); } __bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end) { u32 size; if (!ptr->data || start > end) return -EINVAL; size = __bpf_dynptr_size(ptr); if (start > size || end > size) return -ERANGE; ptr->offset += start; bpf_dynptr_set_size(ptr, end - start); return 0; } __bpf_kfunc bool bpf_dynptr_is_null(struct bpf_dynptr_kern *ptr) { return !ptr->data; } __bpf_kfunc bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) { if (!ptr->data) return false; return __bpf_dynptr_is_rdonly(ptr); } __bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr_kern *ptr) { if (!ptr->data) return -EINVAL; return __bpf_dynptr_size(ptr); } __bpf_kfunc int bpf_dynptr_clone(struct bpf_dynptr_kern *ptr, struct bpf_dynptr_kern *clone__uninit) { if (!ptr->data) { bpf_dynptr_set_null(clone__uninit); return -EINVAL; } *clone__uninit = *ptr; return 0; } __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; } __bpf_kfunc void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k) { return obj__ign; } __bpf_kfunc void bpf_rcu_read_lock(void) { rcu_read_lock(); } __bpf_kfunc void bpf_rcu_read_unlock(void) { rcu_read_unlock(); } struct bpf_throw_ctx { struct bpf_prog_aux *aux; u64 sp; u64 bp; int cnt; }; static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp) { struct bpf_throw_ctx *ctx = cookie; struct bpf_prog *prog; if (!is_bpf_text_address(ip)) return !ctx->cnt; prog = bpf_prog_ksym_find(ip); ctx->cnt++; if (bpf_is_subprog(prog)) return true; ctx->aux = prog->aux; ctx->sp = sp; ctx->bp = bp; return false; } __bpf_kfunc void bpf_throw(u64 cookie) { struct bpf_throw_ctx ctx = {}; arch_bpf_stack_walk(bpf_stack_walker, &ctx); WARN_ON_ONCE(!ctx.aux); if (ctx.aux) WARN_ON_ONCE(!ctx.aux->exception_boundary); WARN_ON_ONCE(!ctx.bp); WARN_ON_ONCE(!ctx.cnt); /* Prevent KASAN false positives for CONFIG_KASAN_STACK by unpoisoning * deeper stack depths than ctx.sp as we do not return from bpf_throw, * which skips compiler generated instrumentation to do the same.
*/ kasan_unpoison_task_stack_below((void *)(long)ctx.sp); ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0); WARN(1, "A call to BPF exception callback should never return\n"); } __bpf_kfunc_end_defs(); BTF_SET8_START(generic_btf_ids) #ifdef CONFIG_KEXEC_CORE BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE) BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_add_impl) BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU) BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL) #endif BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) BTF_SET8_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { .owner = THIS_MODULE, .set = &generic_btf_ids, }; BTF_ID_LIST(generic_dtor_ids) BTF_ID(struct, task_struct) BTF_ID(func, bpf_task_release_dtor) #ifdef CONFIG_CGROUPS BTF_ID(struct, cgroup) BTF_ID(func, bpf_cgroup_release_dtor) #endif BTF_SET8_START(common_btf_ids) BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx) BTF_ID_FLAGS(func, bpf_rdonly_cast) BTF_ID_FLAGS(func, bpf_rcu_read_lock) BTF_ID_FLAGS(func, bpf_rcu_read_unlock) BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU) BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY) #endif BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_SET8_END(common_btf_ids) static const struct btf_kfunc_id_set 
common_kfunc_set = { .owner = THIS_MODULE, .set = &common_btf_ids, }; static int __init kfunc_init(void) { int ret; const struct btf_id_dtor_kfunc generic_dtors[] = { { .btf_id = generic_dtor_ids[0], .kfunc_btf_id = generic_dtor_ids[1] }, #ifdef CONFIG_CGROUPS { .btf_id = generic_dtor_ids[2], .kfunc_btf_id = generic_dtor_ids[3] }, #endif }; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_C