// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2015 Patrick McHardy <kaber@trash.net>
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>

struct nft_dynset {
	struct nft_set		*set;
	struct nft_set_ext_tmpl	tmpl;
	enum nft_dynset_ops	op:8;
	u8			sreg_key;
	u8			sreg_data;
	bool			invert;
	bool			expr;
	u8			num_exprs;
	u64			timeout;
	struct nft_expr		*expr_array[NFT_SET_EXPR_MAX];
	struct nft_set_binding	binding;
};

static int nft_dynset_expr_setup(const struct nft_dynset *priv,
				 const struct nft_set_ext *ext)
{
	struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
	struct nft_expr *expr;
	int i;

	for (i = 0; i < priv->num_exprs; i++) {
		expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
		if (nft_expr_clone(expr, priv->expr_array[i], GFP_ATOMIC) < 0)
			return -1;

		elem_expr->size += priv->expr_array[i]->ops->size;
	}

	return 0;
}

static struct nft_elem_priv *nft_dynset_new(struct nft_set *set,
					    const struct nft_expr *expr,
					    struct nft_regs *regs)
{
	const struct nft_dynset *priv = nft_expr_priv(expr);
	struct nft_set_ext *ext;
	void *elem_priv;
	u64 timeout;

	if (!atomic_add_unless(&set->nelems, 1, set->size))
		return NULL;

	timeout = priv->timeout ?
		  : READ_ONCE(set->timeout);
	elem_priv = nft_set_elem_init(set, &priv->tmpl,
				      &regs->data[priv->sreg_key], NULL,
				      &regs->data[priv->sreg_data],
				      timeout, 0, GFP_ATOMIC);
	if (IS_ERR(elem_priv))
		goto err1;

	ext = nft_set_elem_ext(set, elem_priv);
	if (priv->num_exprs && nft_dynset_expr_setup(priv, ext) < 0)
		goto err2;

	return elem_priv;

err2:
	nft_set_elem_destroy(set, elem_priv, false);
err1:
	if (set->size)
		atomic_dec(&set->nelems);
	return NULL;
}

void nft_dynset_eval(const struct nft_expr *expr,
		     struct nft_regs *regs, const struct nft_pktinfo *pkt)
{
	const struct nft_dynset *priv = nft_expr_priv(expr);
	struct nft_set *set = priv->set;
	const struct nft_set_ext *ext;
	u64 timeout;

	if (priv->op == NFT_DYNSET_OP_DELETE) {
		set->ops->delete(set, &regs->data[priv->sreg_key]);
		return;
	}

	if (set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new,
			     expr, regs, &ext)) {
		if (priv->op == NFT_DYNSET_OP_UPDATE &&
		    nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
		    READ_ONCE(nft_set_ext_timeout(ext)->timeout) != 0) {
			timeout = priv->timeout ? : READ_ONCE(set->timeout);
			WRITE_ONCE(nft_set_ext_timeout(ext)->expiration,
				   get_jiffies_64() + timeout);
		}

		nft_set_elem_update_expr(ext, regs, pkt);

		if (priv->invert)
			regs->verdict.code = NFT_BREAK;
		return;
	}

	if (!priv->invert)
		regs->verdict.code = NFT_BREAK;
}

static void nft_dynset_ext_add_expr(struct nft_dynset *priv)
{
	u8 size = 0;
	int i;

	for (i = 0; i < priv->num_exprs; i++)
		size += priv->expr_array[i]->ops->size;

	nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPRESSIONS,
			       sizeof(struct nft_set_elem_expr) + size);
}

static struct nft_expr *
nft_dynset_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set,
		      const struct nlattr *attr, int pos)
{
	struct nft_expr *expr;
	int err;

	expr = nft_set_elem_expr_alloc(ctx, set, attr);
	if (IS_ERR(expr))
		return expr;

	if (set->exprs[pos] && set->exprs[pos]->ops != expr->ops) {
		err = -EOPNOTSUPP;
		goto err_dynset_expr;
	}

	return expr;

err_dynset_expr:
	nft_expr_destroy(ctx, expr);
	return ERR_PTR(err);
}

static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
	[NFTA_DYNSET_SET_NAME]	= { .type = NLA_STRING,
				    .len = NFT_SET_MAXNAMELEN - 1 },
	[NFTA_DYNSET_SET_ID]	= { .type = NLA_U32 },
	[NFTA_DYNSET_OP]	= NLA_POLICY_MAX(NLA_BE32, 255),
	[NFTA_DYNSET_SREG_KEY]	= { .type = NLA_U32 },
	[NFTA_DYNSET_SREG_DATA]	= { .type = NLA_U32 },
	[NFTA_DYNSET_TIMEOUT]	= { .type = NLA_U64 },
	[NFTA_DYNSET_EXPR]	= { .type = NLA_NESTED },
	[NFTA_DYNSET_FLAGS]	= { .type = NLA_U32 },
	[NFTA_DYNSET_EXPRESSIONS] = { .type = NLA_NESTED },
};

static int nft_dynset_init(const struct nft_ctx *ctx,
			   const struct nft_expr *expr,
			   const struct nlattr * const tb[])
{
	struct nftables_pernet *nft_net = nft_pernet(ctx->net);
	struct nft_dynset *priv = nft_expr_priv(expr);
	u8 genmask = nft_genmask_next(ctx->net);
	struct nft_set *set;
	u64 timeout;
	int err, i;

	lockdep_assert_held(&nft_net->commit_mutex);

	if (tb[NFTA_DYNSET_SET_NAME] == NULL ||
	    tb[NFTA_DYNSET_OP] == NULL ||
	    tb[NFTA_DYNSET_SREG_KEY] == NULL)
		return -EINVAL;

	if (tb[NFTA_DYNSET_FLAGS]) {
		u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS]));

		if (flags & ~(NFT_DYNSET_F_INV | NFT_DYNSET_F_EXPR))
			return -EOPNOTSUPP;
		if (flags & NFT_DYNSET_F_INV)
			priv->invert = true;
		if (flags & NFT_DYNSET_F_EXPR)
			priv->expr = true;
	}

	set = nft_set_lookup_global(ctx->net, ctx->table,
				    tb[NFTA_DYNSET_SET_NAME],
				    tb[NFTA_DYNSET_SET_ID], genmask);
	if (IS_ERR(set))
		return PTR_ERR(set);

	if (set->flags & NFT_SET_OBJECT)
		return -EOPNOTSUPP;

	if (set->ops->update == NULL)
		return -EOPNOTSUPP;

	if (set->flags & NFT_SET_CONSTANT)
		return -EBUSY;
priv->op = ntohl(nla_get_be32(tb[NFTA_DYNSET_OP])); if (priv->op > NFT_DYNSET_OP_DELETE) return -EOPNOTSUPP; timeout = 0; if (tb[NFTA_DYNSET_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) return -EOPNOTSUPP; err = nf_msecs_to_jiffies64(tb[NFTA_DYNSET_TIMEOUT], &timeout); if (err) return err; } err = nft_parse_register_load(ctx, tb[NFTA_DYNSET_SREG_KEY], &priv->sreg_key, set->klen); if (err < 0) return err; if (tb[NFTA_DYNSET_SREG_DATA] != NULL) { if (!(set->flags & NFT_SET_MAP)) return -EOPNOTSUPP; if (set->dtype == NFT_DATA_VERDICT) return -EOPNOTSUPP; err = nft_parse_register_load(ctx, tb[NFTA_DYNSET_SREG_DATA], &priv->sreg_data, set->dlen); if (err < 0) return err; } else if (set->flags & NFT_SET_MAP) return -EINVAL; if ((tb[NFTA_DYNSET_EXPR] || tb[NFTA_DYNSET_EXPRESSIONS]) && !(set->flags & NFT_SET_EVAL)) return -EINVAL; if (tb[NFTA_DYNSET_EXPR]) { struct nft_expr *dynset_expr; dynset_expr = nft_dynset_expr_alloc(ctx, set, tb[NFTA_DYNSET_EXPR], 0); if (IS_ERR(dynset_expr)) return PTR_ERR(dynset_expr); priv->num_exprs++; priv->expr_array[0] = dynset_expr; if (set->num_exprs > 1 || (set->num_exprs == 1 && dynset_expr->ops != set->exprs[0]->ops)) { err = -EOPNOTSUPP; goto err_expr_free; } } else if (tb[NFTA_DYNSET_EXPRESSIONS]) { struct nft_expr *dynset_expr; struct nlattr *tmp; int left; if (!priv->expr) return -EINVAL; i = 0; nla_for_each_nested(tmp, tb[NFTA_DYNSET_EXPRESSIONS], left) { if (i == NFT_SET_EXPR_MAX) { err = -E2BIG; goto err_expr_free; } if (nla_type(tmp) != NFTA_LIST_ELEM) { err = -EINVAL; goto err_expr_free; } dynset_expr = nft_dynset_expr_alloc(ctx, set, tmp, i); if (IS_ERR(dynset_expr)) { err = PTR_ERR(dynset_expr); goto err_expr_free; } priv->expr_array[i] = dynset_expr; priv->num_exprs++; if (set->num_exprs) { if (i >= set->num_exprs) { err = -EINVAL; goto err_expr_free; } if (dynset_expr->ops != set->exprs[i]->ops) { err = -EOPNOTSUPP; goto err_expr_free; } } i++; } if (set->num_exprs && set->num_exprs != i) { err = -EOPNOTSUPP; goto err_expr_free; } } else if (set->num_exprs > 0) { err = nft_set_elem_expr_clone(ctx, set, priv->expr_array); if (err < 0) return err; priv->num_exprs = set->num_exprs; } nft_set_ext_prepare(&priv->tmpl); nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_KEY, set->klen); if (set->flags & NFT_SET_MAP) nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_DATA, set->dlen); if (priv->num_exprs) nft_dynset_ext_add_expr(priv); if (set->flags & NFT_SET_TIMEOUT && (timeout || READ_ONCE(set->timeout))) nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_TIMEOUT); priv->timeout = timeout; err = nf_tables_bind_set(ctx, set, &priv->binding); if (err < 0) goto err_expr_free; if (set->size == 0) set->size = 0xffff; priv->set = set; return 0; err_expr_free: for (i = 0; i < priv->num_exprs; i++) nft_expr_destroy(ctx, priv->expr_array[i]); return err; } static void nft_dynset_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_dynset *priv = nft_expr_priv(expr); nf_tables_deactivate_set(ctx, priv->set, &priv->binding, phase); } static void nft_dynset_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_dynset *priv = nft_expr_priv(expr); nf_tables_activate_set(ctx, priv->set); } static void nft_dynset_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_dynset *priv = nft_expr_priv(expr); int i; for (i = 0; i < priv->num_exprs; i++) nft_expr_destroy(ctx, priv->expr_array[i]); nf_tables_destroy_set(ctx, priv->set); } static int nft_dynset_dump(struct sk_buff 
*skb, const struct nft_expr *expr, bool reset) { const struct nft_dynset *priv = nft_expr_priv(expr); u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0; int i; if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key)) goto nla_put_failure; if (priv->set->flags & NFT_SET_MAP && nft_dump_register(skb, NFTA_DYNSET_SREG_DATA, priv->sreg_data)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_DYNSET_OP, htonl(priv->op))) goto nla_put_failure; if (nla_put_string(skb, NFTA_DYNSET_SET_NAME, priv->set->name)) goto nla_put_failure; if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, nf_jiffies64_to_msecs(priv->timeout), NFTA_DYNSET_PAD)) goto nla_put_failure; if (priv->set->num_exprs == 0) { if (priv->num_exprs == 1) { if (nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr_array[0], reset)) goto nla_put_failure; } else if (priv->num_exprs > 1) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, NFTA_DYNSET_EXPRESSIONS); if (!nest) goto nla_put_failure; for (i = 0; i < priv->num_exprs; i++) { if (nft_expr_dump(skb, NFTA_LIST_ELEM, priv->expr_array[i], reset)) goto nla_put_failure; } nla_nest_end(skb, nest); } } if (nla_put_be32(skb, NFTA_DYNSET_FLAGS, htonl(flags))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static const struct nft_expr_ops nft_dynset_ops = { .type = &nft_dynset_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_dynset)), .eval = nft_dynset_eval, .init = nft_dynset_init, .destroy = nft_dynset_destroy, .activate = nft_dynset_activate, .deactivate = nft_dynset_deactivate, .dump = nft_dynset_dump, .reduce = NFT_REDUCE_READONLY, }; struct nft_expr_type nft_dynset_type __read_mostly = { .name = "dynset", .ops = &nft_dynset_ops, .policy = nft_dynset_policy, .maxattr = NFTA_DYNSET_MAX, .owner = THIS_MODULE, }; |
// SPDX-License-Identifier: GPL-2.0
/*
 * FUSE inode io modes.
 *
 * Copyright (c) 2024 CTERA Networks.
 */

#include "fuse_i.h"

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/file.h>
#include <linux/fs.h>

/*
 * Return true if we need to wait for new opens in caching mode.
 */
static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi)
{
	return READ_ONCE(fi->iocachectr) < 0 && !fuse_inode_backing(fi);
}

/*
 * Called on cached file open() and on first mmap() of direct_io file.
 * Takes cached_io inode mode reference to be dropped on file release.
 *
 * Blocks new parallel dio writes and waits for the in-progress parallel dio
 * writes to complete.
 */
int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff)
{
	struct fuse_inode *fi = get_fuse_inode(inode);

	/* There are no io modes if server does not implement open */
	if (!ff->args)
		return 0;

	spin_lock(&fi->lock);
	/*
	 * Setting the bit advises new direct-io writes to use an exclusive
	 * lock - without it the wait below might be forever.
	 */
	while (fuse_is_io_cache_wait(fi)) {
		set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
		spin_unlock(&fi->lock);
		wait_event(fi->direct_io_waitq, !fuse_is_io_cache_wait(fi));
		spin_lock(&fi->lock);
	}

	/*
	 * Check if inode entered passthrough io mode while waiting for parallel
	 * dio write completion.
*/ if (fuse_inode_backing(fi)) { clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); spin_unlock(&fi->lock); return -ETXTBSY; } WARN_ON(ff->iomode == IOM_UNCACHED); if (ff->iomode == IOM_NONE) { ff->iomode = IOM_CACHED; if (fi->iocachectr == 0) set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); fi->iocachectr++; } spin_unlock(&fi->lock); return 0; } static void fuse_file_cached_io_release(struct fuse_file *ff, struct fuse_inode *fi) { spin_lock(&fi->lock); WARN_ON(fi->iocachectr <= 0); WARN_ON(ff->iomode != IOM_CACHED); ff->iomode = IOM_NONE; fi->iocachectr--; if (fi->iocachectr == 0) clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); spin_unlock(&fi->lock); } /* Start strictly uncached io mode where cache access is not allowed */ int fuse_inode_uncached_io_start(struct fuse_inode *fi, struct fuse_backing *fb) { struct fuse_backing *oldfb; int err = 0; spin_lock(&fi->lock); /* deny conflicting backing files on same fuse inode */ oldfb = fuse_inode_backing(fi); if (fb && oldfb && oldfb != fb) { err = -EBUSY; goto unlock; } if (fi->iocachectr > 0) { err = -ETXTBSY; goto unlock; } fi->iocachectr--; /* fuse inode holds a single refcount of backing file */ if (fb && !oldfb) { oldfb = fuse_inode_backing_set(fi, fb); WARN_ON_ONCE(oldfb != NULL); } else { fuse_backing_put(fb); } unlock: spin_unlock(&fi->lock); return err; } /* Takes uncached_io inode mode reference to be dropped on file release */ static int fuse_file_uncached_io_open(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb) { struct fuse_inode *fi = get_fuse_inode(inode); int err; err = fuse_inode_uncached_io_start(fi, fb); if (err) return err; WARN_ON(ff->iomode != IOM_NONE); ff->iomode = IOM_UNCACHED; return 0; } void fuse_inode_uncached_io_end(struct fuse_inode *fi) { struct fuse_backing *oldfb = NULL; spin_lock(&fi->lock); WARN_ON(fi->iocachectr >= 0); fi->iocachectr++; if (!fi->iocachectr) { wake_up(&fi->direct_io_waitq); oldfb = fuse_inode_backing_set(fi, NULL); } spin_unlock(&fi->lock); if (oldfb) fuse_backing_put(oldfb); } /* Drop uncached_io reference from passthrough open */ static void fuse_file_uncached_io_release(struct fuse_file *ff, struct fuse_inode *fi) { WARN_ON(ff->iomode != IOM_UNCACHED); ff->iomode = IOM_NONE; fuse_inode_uncached_io_end(fi); } /* * Open flags that are allowed in combination with FOPEN_PASSTHROUGH. * A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write * operations go directly to the server, but mmap is done on the backing file. * FOPEN_PASSTHROUGH mode should not co-exist with any users of the fuse inode * page cache, so FOPEN_KEEP_CACHE is a strange and undesired combination. 
*/ #define FOPEN_PASSTHROUGH_MASK \ (FOPEN_PASSTHROUGH | FOPEN_DIRECT_IO | FOPEN_PARALLEL_DIRECT_WRITES | \ FOPEN_NOFLUSH) static int fuse_file_passthrough_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_backing *fb; int err; /* Check allowed conditions for file open in passthrough mode */ if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) || !fc->passthrough || (ff->open_flags & ~FOPEN_PASSTHROUGH_MASK)) return -EINVAL; fb = fuse_passthrough_open(file, inode, ff->args->open_outarg.backing_id); if (IS_ERR(fb)) return PTR_ERR(fb); /* First passthrough file open denies caching inode io mode */ err = fuse_file_uncached_io_open(inode, ff, fb); if (!err) return 0; fuse_passthrough_release(ff, fb); fuse_backing_put(fb); return err; } /* Request access to submit new io to inode via open file */ int fuse_file_io_open(struct file *file, struct inode *inode) { struct fuse_file *ff = file->private_data; struct fuse_inode *fi = get_fuse_inode(inode); int err; /* * io modes are not relevant with DAX and with server that does not * implement open. */ if (FUSE_IS_DAX(inode) || !ff->args) return 0; /* * Server is expected to use FOPEN_PASSTHROUGH for all opens of an inode * which is already open for passthrough. */ err = -EINVAL; if (fuse_inode_backing(fi) && !(ff->open_flags & FOPEN_PASSTHROUGH)) goto fail; /* * FOPEN_PARALLEL_DIRECT_WRITES requires FOPEN_DIRECT_IO. */ if (!(ff->open_flags & FOPEN_DIRECT_IO)) ff->open_flags &= ~FOPEN_PARALLEL_DIRECT_WRITES; /* * First passthrough file open denies caching inode io mode. * First caching file open enters caching inode io mode. * * Note that if user opens a file open with O_DIRECT, but server did * not specify FOPEN_DIRECT_IO, a later fcntl() could remove O_DIRECT, * so we put the inode in caching mode to prevent parallel dio. */ if ((ff->open_flags & FOPEN_DIRECT_IO) && !(ff->open_flags & FOPEN_PASSTHROUGH)) return 0; if (ff->open_flags & FOPEN_PASSTHROUGH) err = fuse_file_passthrough_open(inode, file); else err = fuse_file_cached_io_open(inode, ff); if (err) goto fail; return 0; fail: pr_debug("failed to open file in requested io mode (open_flags=0x%x, err=%i).\n", ff->open_flags, err); /* * The file open mode determines the inode io mode. * Using incorrect open mode is a server mistake, which results in * user visible failure of open() with EIO error. */ return -EIO; } /* No more pending io and no new io possible to inode via open/mmapped file */ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); /* * Last passthrough file close allows caching inode io mode. * Last caching file close exits caching inode io mode. */ switch (ff->iomode) { case IOM_NONE: /* Nothing to do */ break; case IOM_UNCACHED: fuse_file_uncached_io_release(ff, fi); break; case IOM_CACHED: fuse_file_cached_io_release(ff, fi); break; } } |
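/*
 * Illustrative sketch (not part of the FUSE code above): the comments in
 * this file describe fi->iocachectr as a signed mode counter -- positive
 * while cached opens hold references, negative while uncached/passthrough
 * opens hold references, with conflicting modes refused. The hypothetical
 * iomode_counter helpers below model only that counting rule so the
 * intended state machine is easier to see; the names are invented for this
 * example and the block is not built.
 */
#if 0
struct iomode_counter {
	int ctr;	/* > 0: cached users, < 0: uncached users, 0: idle */
};

static int iomode_get_cached(struct iomode_counter *m)
{
	if (m->ctr < 0)
		return -ETXTBSY;	/* uncached/passthrough users present */
	m->ctr++;
	return 0;
}

static int iomode_get_uncached(struct iomode_counter *m)
{
	if (m->ctr > 0)
		return -ETXTBSY;	/* cached users present */
	m->ctr--;
	return 0;
}

static void iomode_put(struct iomode_counter *m, bool cached)
{
	if (cached)
		m->ctr--;
	else
		m->ctr++;
}
#endif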
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2007-2012 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/if_arp.h>
#include <linux/if_bridge.h>
#include <linux/if_vlan.h>
#include <linux/kernel.h>
#include <linux/llc.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <linux/openvswitch.h>
#include <linux/export.h>

#include <net/ip_tunnels.h>
#include <net/rtnetlink.h>

#include "datapath.h"
#include "vport.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"

static struct vport_ops ovs_netdev_vport_ops;

/* Must be called with rcu_read_lock. */
static void netdev_port_receive(struct sk_buff *skb)
{
	struct vport *vport;

	vport = ovs_netdev_get_vport(skb->dev);
	if (unlikely(!vport))
		goto error;

	if (unlikely(skb_warn_if_lro(skb)))
		goto error;

	/* Make our own copy of the packet.  Otherwise we will mangle the
	 * packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
	 */
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (unlikely(!skb))
		return;

	if (skb->dev->type == ARPHRD_ETHER)
		skb_push_rcsum(skb, ETH_HLEN);

	ovs_vport_receive(vport, skb, skb_tunnel_info(skb));
	return;
error:
	kfree_skb(skb);
}

/* Called with rcu_read_lock and bottom-halves disabled. */
static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
		return RX_HANDLER_PASS;

	netdev_port_receive(skb);
	return RX_HANDLER_CONSUMED;
}

static struct net_device *get_dpdev(const struct datapath *dp)
{
	struct vport *local;

	local = ovs_vport_ovsl(dp, OVSP_LOCAL);
	return local->dev;
}

struct vport *ovs_netdev_link(struct vport *vport, const char *name)
{
	int err;

	vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name);
	if (!vport->dev) {
		err = -ENODEV;
		goto error_free_vport;
	}
	/* Ensure that the device exists and that the provided
	 * name is not one of its aliases.
*/ if (strcmp(name, ovs_vport_name(vport))) { err = -ENODEV; goto error_put; } netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL); if (vport->dev->flags & IFF_LOOPBACK || (vport->dev->type != ARPHRD_ETHER && vport->dev->type != ARPHRD_NONE) || ovs_is_internal_dev(vport->dev)) { err = -EINVAL; goto error_put; } rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, get_dpdev(vport->dp), NULL, NULL, NULL); if (err) goto error_unlock; err = netdev_rx_handler_register(vport->dev, netdev_frame_hook, vport); if (err) goto error_master_upper_dev_unlink; dev_disable_lro(vport->dev); dev_set_promiscuity(vport->dev, 1); vport->dev->priv_flags |= IFF_OVS_DATAPATH; rtnl_unlock(); return vport; error_master_upper_dev_unlink: netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); error_unlock: rtnl_unlock(); error_put: netdev_put(vport->dev, &vport->dev_tracker); error_free_vport: ovs_vport_free(vport); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(ovs_netdev_link); static struct vport *netdev_create(const struct vport_parms *parms) { struct vport *vport; vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); if (IS_ERR(vport)) return vport; return ovs_netdev_link(vport, parms->name); } static void vport_netdev_free(struct rcu_head *rcu) { struct vport *vport = container_of(rcu, struct vport, rcu); netdev_put(vport->dev, &vport->dev_tracker); ovs_vport_free(vport); } void ovs_netdev_detach_dev(struct vport *vport) { ASSERT_RTNL(); vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; netdev_rx_handler_unregister(vport->dev); netdev_upper_dev_unlink(vport->dev, netdev_master_upper_dev_get(vport->dev)); dev_set_promiscuity(vport->dev, -1); } static void netdev_destroy(struct vport *vport) { rtnl_lock(); if (netif_is_ovs_port(vport->dev)) ovs_netdev_detach_dev(vport); rtnl_unlock(); call_rcu(&vport->rcu, vport_netdev_free); } void ovs_netdev_tunnel_destroy(struct vport *vport) { rtnl_lock(); if (netif_is_ovs_port(vport->dev)) ovs_netdev_detach_dev(vport); /* We can be invoked by both explicit vport deletion and * underlying netdev deregistration; delete the link only * if it's not already shutting down. */ if (vport->dev->reg_state == NETREG_REGISTERED) rtnl_delete_link(vport->dev, 0, NULL); netdev_put(vport->dev, &vport->dev_tracker); vport->dev = NULL; rtnl_unlock(); call_rcu(&vport->rcu, vport_netdev_free); } EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); /* Returns null if this device is not attached to a datapath. */ struct vport *ovs_netdev_get_vport(struct net_device *dev) { if (likely(netif_is_ovs_port(dev))) return (struct vport *) rcu_dereference_rtnl(dev->rx_handler_data); else return NULL; } static struct vport_ops ovs_netdev_vport_ops = { .type = OVS_VPORT_TYPE_NETDEV, .create = netdev_create, .destroy = netdev_destroy, .send = dev_queue_xmit, }; int __init ovs_netdev_init(void) { return ovs_vport_ops_register(&ovs_netdev_vport_ops); } void ovs_netdev_exit(void) { ovs_vport_ops_unregister(&ovs_netdev_vport_ops); } |
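/*
 * Illustrative sketch (not part of Open vSwitch): ovs_netdev_link() above
 * attaches the datapath to a device with netdev_rx_handler_register() and
 * relies on skb_share_check() so shared skbs (e.g. ones also delivered to
 * AF_PACKET taps) are cloned before being modified. The minimal handler
 * below shows that same pattern in isolation; the my_* names are invented
 * for this example and the block is not built.
 */
#if 0
static rx_handler_result_t my_frame_hook(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	/* Never consume looped-back traffic. */
	if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
		return RX_HANDLER_PASS;

	/* Clone if someone else (e.g. a tap) still holds a reference. */
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (unlikely(!skb))
		return RX_HANDLER_CONSUMED;

	/* ... hand skb to the private receive path here ... */
	kfree_skb(skb);
	return RX_HANDLER_CONSUMED;
}

static int my_attach(struct net_device *dev, void *priv)
{
	int err;

	/* The rx_handler must be (un)registered under RTNL. */
	rtnl_lock();
	err = netdev_rx_handler_register(dev, my_frame_hook, priv);
	rtnl_unlock();
	return err;
}
#endif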
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm, AES-NI/AVX optimized.
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (c) 2021, Alibaba Group.
 * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/module.h>
#include <linux/crypto.h>
#include <linux/kernel.h>
#include <asm/simd.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/sm4.h>
#include "sm4-avx.h"

#define SM4_CRYPT8_BLOCK_SIZE	(SM4_BLOCK_SIZE * 8)

asmlinkage void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
				     const u8 *src, int nblocks);
asmlinkage void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
				     const u8 *src, int nblocks);
asmlinkage void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
					   const u8 *src, u8 *iv);
asmlinkage void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
					   const u8 *src, u8 *iv);

static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
			       unsigned int key_len)
{
	struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);

	return sm4_expandkey(ctx, key, key_len);
}

static int ecb_do_crypt(struct skcipher_request *req, const u32 *rkey)
{
	struct skcipher_walk walk;
	unsigned int nbytes;
	int err;

	err = skcipher_walk_virt(&walk, req, false);

	while ((nbytes = walk.nbytes) > 0) {
		const u8 *src = walk.src.virt.addr;
		u8 *dst = walk.dst.virt.addr;

		kernel_fpu_begin();
		while (nbytes >= SM4_CRYPT8_BLOCK_SIZE) {
			sm4_aesni_avx_crypt8(rkey, dst, src, 8);
			dst += SM4_CRYPT8_BLOCK_SIZE;
			src += SM4_CRYPT8_BLOCK_SIZE;
			nbytes -= SM4_CRYPT8_BLOCK_SIZE;
		}
		while (nbytes >= SM4_BLOCK_SIZE) {
			unsigned int nblocks = min(nbytes >> 4, 4u);

			sm4_aesni_avx_crypt4(rkey, dst, src, nblocks);
			dst += nblocks * SM4_BLOCK_SIZE;
			src += nblocks * SM4_BLOCK_SIZE;
			nbytes -= nblocks * SM4_BLOCK_SIZE;
		}
		kernel_fpu_end();

		err = skcipher_walk_done(&walk, nbytes);
	}

	return err;
}

int sm4_avx_ecb_encrypt(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);

	return ecb_do_crypt(req, ctx->rkey_enc);
}
EXPORT_SYMBOL_GPL(sm4_avx_ecb_encrypt);

int sm4_avx_ecb_decrypt(struct skcipher_request *req)
{
	struct crypto_skcipher
*tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); return ecb_do_crypt(req, ctx->rkey_dec); } EXPORT_SYMBOL_GPL(sm4_avx_ecb_decrypt); int sm4_cbc_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) > 0) { const u8 *iv = walk.iv; const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; while (nbytes >= SM4_BLOCK_SIZE) { crypto_xor_cpy(dst, src, iv, SM4_BLOCK_SIZE); sm4_crypt_block(ctx->rkey_enc, dst, dst); iv = dst; src += SM4_BLOCK_SIZE; dst += SM4_BLOCK_SIZE; nbytes -= SM4_BLOCK_SIZE; } if (iv != walk.iv) memcpy(walk.iv, iv, SM4_BLOCK_SIZE); err = skcipher_walk_done(&walk, nbytes); } return err; } EXPORT_SYMBOL_GPL(sm4_cbc_encrypt); int sm4_avx_cbc_decrypt(struct skcipher_request *req, unsigned int bsize, sm4_crypt_func func) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) > 0) { const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; kernel_fpu_begin(); while (nbytes >= bsize) { func(ctx->rkey_dec, dst, src, walk.iv); dst += bsize; src += bsize; nbytes -= bsize; } while (nbytes >= SM4_BLOCK_SIZE) { u8 keystream[SM4_BLOCK_SIZE * 8]; u8 iv[SM4_BLOCK_SIZE]; unsigned int nblocks = min(nbytes >> 4, 8u); int i; sm4_aesni_avx_crypt8(ctx->rkey_dec, keystream, src, nblocks); src += ((int)nblocks - 2) * SM4_BLOCK_SIZE; dst += (nblocks - 1) * SM4_BLOCK_SIZE; memcpy(iv, src + SM4_BLOCK_SIZE, SM4_BLOCK_SIZE); for (i = nblocks - 1; i > 0; i--) { crypto_xor_cpy(dst, src, &keystream[i * SM4_BLOCK_SIZE], SM4_BLOCK_SIZE); src -= SM4_BLOCK_SIZE; dst -= SM4_BLOCK_SIZE; } crypto_xor_cpy(dst, walk.iv, keystream, SM4_BLOCK_SIZE); memcpy(walk.iv, iv, SM4_BLOCK_SIZE); dst += nblocks * SM4_BLOCK_SIZE; src += (nblocks + 1) * SM4_BLOCK_SIZE; nbytes -= nblocks * SM4_BLOCK_SIZE; } kernel_fpu_end(); err = skcipher_walk_done(&walk, nbytes); } return err; } EXPORT_SYMBOL_GPL(sm4_avx_cbc_decrypt); static int cbc_decrypt(struct skcipher_request *req) { return sm4_avx_cbc_decrypt(req, SM4_CRYPT8_BLOCK_SIZE, sm4_aesni_avx_cbc_dec_blk8); } int sm4_avx_ctr_crypt(struct skcipher_request *req, unsigned int bsize, sm4_crypt_func func) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) > 0) { const u8 *src = walk.src.virt.addr; u8 *dst = walk.dst.virt.addr; kernel_fpu_begin(); while (nbytes >= bsize) { func(ctx->rkey_enc, dst, src, walk.iv); dst += bsize; src += bsize; nbytes -= bsize; } while (nbytes >= SM4_BLOCK_SIZE) { u8 keystream[SM4_BLOCK_SIZE * 8]; unsigned int nblocks = min(nbytes >> 4, 8u); int i; for (i = 0; i < nblocks; i++) { memcpy(&keystream[i * SM4_BLOCK_SIZE], walk.iv, SM4_BLOCK_SIZE); crypto_inc(walk.iv, SM4_BLOCK_SIZE); } sm4_aesni_avx_crypt8(ctx->rkey_enc, keystream, keystream, nblocks); crypto_xor_cpy(dst, src, keystream, nblocks * SM4_BLOCK_SIZE); dst += nblocks * SM4_BLOCK_SIZE; src += nblocks * SM4_BLOCK_SIZE; nbytes -= nblocks * SM4_BLOCK_SIZE; } kernel_fpu_end(); /* tail */ if (walk.nbytes == walk.total && nbytes > 0) { u8 keystream[SM4_BLOCK_SIZE]; 
memcpy(keystream, walk.iv, SM4_BLOCK_SIZE); crypto_inc(walk.iv, SM4_BLOCK_SIZE); sm4_crypt_block(ctx->rkey_enc, keystream, keystream); crypto_xor_cpy(dst, src, keystream, nbytes); dst += nbytes; src += nbytes; nbytes = 0; } err = skcipher_walk_done(&walk, nbytes); } return err; } EXPORT_SYMBOL_GPL(sm4_avx_ctr_crypt); static int ctr_crypt(struct skcipher_request *req) { return sm4_avx_ctr_crypt(req, SM4_CRYPT8_BLOCK_SIZE, sm4_aesni_avx_ctr_enc_blk8); } static struct skcipher_alg sm4_aesni_avx_skciphers[] = { { .base = { .cra_name = "__ecb(sm4)", .cra_driver_name = "__ecb-sm4-aesni-avx", .cra_priority = 400, .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, }, .min_keysize = SM4_KEY_SIZE, .max_keysize = SM4_KEY_SIZE, .walksize = 8 * SM4_BLOCK_SIZE, .setkey = sm4_skcipher_setkey, .encrypt = sm4_avx_ecb_encrypt, .decrypt = sm4_avx_ecb_decrypt, }, { .base = { .cra_name = "__cbc(sm4)", .cra_driver_name = "__cbc-sm4-aesni-avx", .cra_priority = 400, .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, }, .min_keysize = SM4_KEY_SIZE, .max_keysize = SM4_KEY_SIZE, .ivsize = SM4_BLOCK_SIZE, .walksize = 8 * SM4_BLOCK_SIZE, .setkey = sm4_skcipher_setkey, .encrypt = sm4_cbc_encrypt, .decrypt = cbc_decrypt, }, { .base = { .cra_name = "__ctr(sm4)", .cra_driver_name = "__ctr-sm4-aesni-avx", .cra_priority = 400, .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, }, .min_keysize = SM4_KEY_SIZE, .max_keysize = SM4_KEY_SIZE, .ivsize = SM4_BLOCK_SIZE, .chunksize = SM4_BLOCK_SIZE, .walksize = 8 * SM4_BLOCK_SIZE, .setkey = sm4_skcipher_setkey, .encrypt = ctr_crypt, .decrypt = ctr_crypt, } }; static struct simd_skcipher_alg * simd_sm4_aesni_avx_skciphers[ARRAY_SIZE(sm4_aesni_avx_skciphers)]; static int __init sm4_init(void) { const char *feature_name; if (!boot_cpu_has(X86_FEATURE_AVX) || !boot_cpu_has(X86_FEATURE_AES) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX or AES-NI instructions are not detected.\n"); return -ENODEV; } if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } return simd_register_skciphers_compat(sm4_aesni_avx_skciphers, ARRAY_SIZE(sm4_aesni_avx_skciphers), simd_sm4_aesni_avx_skciphers); } static void __exit sm4_exit(void) { simd_unregister_skciphers(sm4_aesni_avx_skciphers, ARRAY_SIZE(sm4_aesni_avx_skciphers), simd_sm4_aesni_avx_skciphers); } module_init(sm4_init); module_exit(sm4_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>"); MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX optimized"); MODULE_ALIAS_CRYPTO("sm4"); MODULE_ALIAS_CRYPTO("sm4-aesni-avx"); |
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Backlight Lowlevel Control Abstraction
 *
 * Copyright (C) 2003,2004 Hewlett-Packard Company
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/init.h>
#include <linux/device.h>
#include <linux/backlight.h>
#include <linux/notifier.h>
#include <linux/ctype.h>
#include <linux/err.h>
#include <linux/fb.h>
#include <linux/slab.h>

#ifdef CONFIG_PMAC_BACKLIGHT
#include <asm/backlight.h>
#endif

/**
 * DOC: overview
 *
 * The backlight core supports implementing backlight drivers.
 *
 * A backlight driver registers a driver using
 * devm_backlight_device_register().
The properties of the backlight * driver such as type and max_brightness must be specified. * When the core detect changes in for example brightness or power state * the update_status() operation is called. The backlight driver shall * implement this operation and use it to adjust backlight. * * Several sysfs attributes are provided by the backlight core:: * * - brightness R/W, set the requested brightness level * - actual_brightness RO, the brightness level used by the HW * - max_brightness RO, the maximum brightness level supported * * See Documentation/ABI/stable/sysfs-class-backlight for the full list. * * The backlight can be adjusted using the sysfs interface, and * the backlight driver may also support adjusting backlight using * a hot-key or some other platform or firmware specific way. * * The driver must implement the get_brightness() operation if * the HW do not support all the levels that can be specified in * brightness, thus providing user-space access to the actual level * via the actual_brightness attribute. * * When the backlight changes this is reported to user-space using * an uevent connected to the actual_brightness attribute. * When brightness is set by platform specific means, for example * a hot-key to adjust backlight, the driver must notify the backlight * core that brightness has changed using backlight_force_update(). * * The backlight driver core receives notifications from fbdev and * if the event is FB_EVENT_BLANK and if the value of blank, from the * FBIOBLANK ioctrl, results in a change in the backlight state the * update_status() operation is called. */ static struct list_head backlight_dev_list; static struct mutex backlight_dev_list_mutex; static const char *const backlight_types[] = { [BACKLIGHT_RAW] = "raw", [BACKLIGHT_PLATFORM] = "platform", [BACKLIGHT_FIRMWARE] = "firmware", }; static const char *const backlight_scale_types[] = { [BACKLIGHT_SCALE_UNKNOWN] = "unknown", [BACKLIGHT_SCALE_LINEAR] = "linear", [BACKLIGHT_SCALE_NON_LINEAR] = "non-linear", }; #if defined(CONFIG_FB_CORE) || (defined(CONFIG_FB_CORE_MODULE) && \ defined(CONFIG_BACKLIGHT_CLASS_DEVICE_MODULE)) /* * fb_notifier_callback * * This callback gets called when something important happens inside a * framebuffer driver. The backlight core only cares about FB_BLANK_UNBLANK * which is reported to the driver using backlight_update_status() * as a state change. * * There may be several fbdev's connected to the backlight device, * in which case they are kept track of. A state change is only reported * if there is a change in backlight for the specified fbdev. */ static int fb_notifier_callback(struct notifier_block *self, unsigned long event, void *data) { struct backlight_device *bd; struct fb_event *evdata = data; struct fb_info *info = evdata->info; struct backlight_device *fb_bd = fb_bl_device(info); int node = info->node; int fb_blank = 0; /* If we aren't interested in this event, skip it immediately ... 
*/ if (event != FB_EVENT_BLANK) return 0; bd = container_of(self, struct backlight_device, fb_notif); mutex_lock(&bd->ops_lock); if (!bd->ops) goto out; if (bd->ops->controls_device && !bd->ops->controls_device(bd, info->device)) goto out; if (fb_bd && fb_bd != bd) goto out; fb_blank = *(int *)evdata->data; if (fb_blank == FB_BLANK_UNBLANK && !bd->fb_bl_on[node]) { bd->fb_bl_on[node] = true; if (!bd->use_count++) { bd->props.state &= ~BL_CORE_FBBLANK; backlight_update_status(bd); } } else if (fb_blank != FB_BLANK_UNBLANK && bd->fb_bl_on[node]) { bd->fb_bl_on[node] = false; if (!(--bd->use_count)) { bd->props.state |= BL_CORE_FBBLANK; backlight_update_status(bd); } } out: mutex_unlock(&bd->ops_lock); return 0; } static int backlight_register_fb(struct backlight_device *bd) { memset(&bd->fb_notif, 0, sizeof(bd->fb_notif)); bd->fb_notif.notifier_call = fb_notifier_callback; return fb_register_client(&bd->fb_notif); } static void backlight_unregister_fb(struct backlight_device *bd) { fb_unregister_client(&bd->fb_notif); } #else static inline int backlight_register_fb(struct backlight_device *bd) { return 0; } static inline void backlight_unregister_fb(struct backlight_device *bd) { } #endif /* CONFIG_FB_CORE */ static void backlight_generate_event(struct backlight_device *bd, enum backlight_update_reason reason) { char *envp[2]; switch (reason) { case BACKLIGHT_UPDATE_SYSFS: envp[0] = "SOURCE=sysfs"; break; case BACKLIGHT_UPDATE_HOTKEY: envp[0] = "SOURCE=hotkey"; break; default: envp[0] = "SOURCE=unknown"; break; } envp[1] = NULL; kobject_uevent_env(&bd->dev.kobj, KOBJ_CHANGE, envp); sysfs_notify(&bd->dev.kobj, NULL, "actual_brightness"); } static ssize_t bl_power_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backlight_device *bd = to_backlight_device(dev); return sprintf(buf, "%d\n", bd->props.power); } static ssize_t bl_power_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int rc; struct backlight_device *bd = to_backlight_device(dev); unsigned long power, old_power; rc = kstrtoul(buf, 0, &power); if (rc) return rc; rc = -ENXIO; mutex_lock(&bd->ops_lock); if (bd->ops) { pr_debug("set power to %lu\n", power); if (bd->props.power != power) { old_power = bd->props.power; bd->props.power = power; rc = backlight_update_status(bd); if (rc) bd->props.power = old_power; else rc = count; } else { rc = count; } } mutex_unlock(&bd->ops_lock); return rc; } static DEVICE_ATTR_RW(bl_power); static ssize_t brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backlight_device *bd = to_backlight_device(dev); return sprintf(buf, "%d\n", bd->props.brightness); } int backlight_device_set_brightness(struct backlight_device *bd, unsigned long brightness) { int rc = -ENXIO; mutex_lock(&bd->ops_lock); if (bd->ops) { if (brightness > bd->props.max_brightness) rc = -EINVAL; else { pr_debug("set brightness to %lu\n", brightness); bd->props.brightness = brightness; rc = backlight_update_status(bd); } } mutex_unlock(&bd->ops_lock); backlight_generate_event(bd, BACKLIGHT_UPDATE_SYSFS); return rc; } EXPORT_SYMBOL(backlight_device_set_brightness); static ssize_t brightness_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int rc; struct backlight_device *bd = to_backlight_device(dev); unsigned long brightness; rc = kstrtoul(buf, 0, &brightness); if (rc) return rc; rc = backlight_device_set_brightness(bd, brightness); return rc ? 
rc : count; } static DEVICE_ATTR_RW(brightness); static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backlight_device *bd = to_backlight_device(dev); return sprintf(buf, "%s\n", backlight_types[bd->props.type]); } static DEVICE_ATTR_RO(type); static ssize_t max_brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backlight_device *bd = to_backlight_device(dev); return sprintf(buf, "%d\n", bd->props.max_brightness); } static DEVICE_ATTR_RO(max_brightness); static ssize_t actual_brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { int rc = -ENXIO; struct backlight_device *bd = to_backlight_device(dev); mutex_lock(&bd->ops_lock); if (bd->ops && bd->ops->get_brightness) { rc = bd->ops->get_brightness(bd); if (rc >= 0) rc = sprintf(buf, "%d\n", rc); } else { rc = sprintf(buf, "%d\n", bd->props.brightness); } mutex_unlock(&bd->ops_lock); return rc; } static DEVICE_ATTR_RO(actual_brightness); static ssize_t scale_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backlight_device *bd = to_backlight_device(dev); if (WARN_ON(bd->props.scale > BACKLIGHT_SCALE_NON_LINEAR)) return sprintf(buf, "unknown\n"); return sprintf(buf, "%s\n", backlight_scale_types[bd->props.scale]); } static DEVICE_ATTR_RO(scale); #ifdef CONFIG_PM_SLEEP static int backlight_suspend(struct device *dev) { struct backlight_device *bd = to_backlight_device(dev); mutex_lock(&bd->ops_lock); if (bd->ops && bd->ops->options & BL_CORE_SUSPENDRESUME) { bd->props.state |= BL_CORE_SUSPENDED; backlight_update_status(bd); } mutex_unlock(&bd->ops_lock); return 0; } static int backlight_resume(struct device *dev) { struct backlight_device *bd = to_backlight_device(dev); mutex_lock(&bd->ops_lock); if (bd->ops && bd->ops->options & BL_CORE_SUSPENDRESUME) { bd->props.state &= ~BL_CORE_SUSPENDED; backlight_update_status(bd); } mutex_unlock(&bd->ops_lock); return 0; } #endif static SIMPLE_DEV_PM_OPS(backlight_class_dev_pm_ops, backlight_suspend, backlight_resume); static void bl_device_release(struct device *dev) { struct backlight_device *bd = to_backlight_device(dev); kfree(bd); } static struct attribute *bl_device_attrs[] = { &dev_attr_bl_power.attr, &dev_attr_brightness.attr, &dev_attr_actual_brightness.attr, &dev_attr_max_brightness.attr, &dev_attr_scale.attr, &dev_attr_type.attr, NULL, }; ATTRIBUTE_GROUPS(bl_device); static const struct class backlight_class = { .name = "backlight", .dev_groups = bl_device_groups, .pm = &backlight_class_dev_pm_ops, }; /** * backlight_force_update - tell the backlight subsystem that hardware state * has changed * @bd: the backlight device to update * @reason: reason for update * * Updates the internal state of the backlight in response to a hardware event, * and generates an uevent to notify userspace. A backlight driver shall call * backlight_force_update() when the backlight is changed using, for example, * a hot-key. The updated brightness is read using get_brightness() and the * brightness value is reported using an uevent. 
*/ void backlight_force_update(struct backlight_device *bd, enum backlight_update_reason reason) { int brightness; mutex_lock(&bd->ops_lock); if (bd->ops && bd->ops->get_brightness) { brightness = bd->ops->get_brightness(bd); if (brightness >= 0) bd->props.brightness = brightness; else dev_err(&bd->dev, "Could not update brightness from device: %pe\n", ERR_PTR(brightness)); } mutex_unlock(&bd->ops_lock); backlight_generate_event(bd, reason); } EXPORT_SYMBOL(backlight_force_update); /* deprecated - use devm_backlight_device_register() */ struct backlight_device *backlight_device_register(const char *name, struct device *parent, void *devdata, const struct backlight_ops *ops, const struct backlight_properties *props) { struct backlight_device *new_bd; int rc; pr_debug("backlight_device_register: name=%s\n", name); new_bd = kzalloc(sizeof(struct backlight_device), GFP_KERNEL); if (!new_bd) return ERR_PTR(-ENOMEM); mutex_init(&new_bd->update_lock); mutex_init(&new_bd->ops_lock); new_bd->dev.class = &backlight_class; new_bd->dev.parent = parent; new_bd->dev.release = bl_device_release; dev_set_name(&new_bd->dev, "%s", name); dev_set_drvdata(&new_bd->dev, devdata); /* Set default properties */ if (props) { memcpy(&new_bd->props, props, sizeof(struct backlight_properties)); if (props->type <= 0 || props->type >= BACKLIGHT_TYPE_MAX) { WARN(1, "%s: invalid backlight type", name); new_bd->props.type = BACKLIGHT_RAW; } } else { new_bd->props.type = BACKLIGHT_RAW; } rc = device_register(&new_bd->dev); if (rc) { put_device(&new_bd->dev); return ERR_PTR(rc); } rc = backlight_register_fb(new_bd); if (rc) { device_unregister(&new_bd->dev); return ERR_PTR(rc); } new_bd->ops = ops; #ifdef CONFIG_PMAC_BACKLIGHT mutex_lock(&pmac_backlight_mutex); if (!pmac_backlight) pmac_backlight = new_bd; mutex_unlock(&pmac_backlight_mutex); #endif mutex_lock(&backlight_dev_list_mutex); list_add(&new_bd->entry, &backlight_dev_list); mutex_unlock(&backlight_dev_list_mutex); return new_bd; } EXPORT_SYMBOL(backlight_device_register); /** backlight_device_get_by_type - find first backlight device of a type * @type: the type of backlight device * * Look up the first backlight device of the specified type * * RETURNS: * * Pointer to backlight device if any was found. Otherwise NULL. */ struct backlight_device *backlight_device_get_by_type(enum backlight_type type) { bool found = false; struct backlight_device *bd; mutex_lock(&backlight_dev_list_mutex); list_for_each_entry(bd, &backlight_dev_list, entry) { if (bd->props.type == type) { found = true; break; } } mutex_unlock(&backlight_dev_list_mutex); return found ? bd : NULL; } EXPORT_SYMBOL(backlight_device_get_by_type); /** * backlight_device_get_by_name - Get backlight device by name * @name: Device name * * This function looks up a backlight device by its name. It obtains a reference * on the backlight device and it is the caller's responsibility to drop the * reference by calling put_device(). * * Returns: * A pointer to the backlight device if found, otherwise NULL. */ struct backlight_device *backlight_device_get_by_name(const char *name) { struct device *dev; dev = class_find_device_by_name(&backlight_class, name); return dev ? 
to_backlight_device(dev) : NULL; } EXPORT_SYMBOL(backlight_device_get_by_name); /* deprecated - use devm_backlight_device_unregister() */ void backlight_device_unregister(struct backlight_device *bd) { if (!bd) return; mutex_lock(&backlight_dev_list_mutex); list_del(&bd->entry); mutex_unlock(&backlight_dev_list_mutex); #ifdef CONFIG_PMAC_BACKLIGHT mutex_lock(&pmac_backlight_mutex); if (pmac_backlight == bd) pmac_backlight = NULL; mutex_unlock(&pmac_backlight_mutex); #endif mutex_lock(&bd->ops_lock); bd->ops = NULL; mutex_unlock(&bd->ops_lock); backlight_unregister_fb(bd); device_unregister(&bd->dev); } EXPORT_SYMBOL(backlight_device_unregister); static void devm_backlight_device_release(struct device *dev, void *res) { struct backlight_device *backlight = *(struct backlight_device **)res; backlight_device_unregister(backlight); } static int devm_backlight_device_match(struct device *dev, void *res, void *data) { struct backlight_device **r = res; return *r == data; } /** * devm_backlight_device_register - register a new backlight device * @dev: the device to register * @name: the name of the device * @parent: a pointer to the parent device (often the same as @dev) * @devdata: an optional pointer to be stored for private driver use * @ops: the backlight operations structure * @props: the backlight properties * * Creates and registers new backlight device. When a backlight device * is registered the configuration must be specified in the @props * parameter. See description of &backlight_properties. * * RETURNS: * * struct backlight on success, or an ERR_PTR on error */ struct backlight_device *devm_backlight_device_register(struct device *dev, const char *name, struct device *parent, void *devdata, const struct backlight_ops *ops, const struct backlight_properties *props) { struct backlight_device **ptr, *backlight; ptr = devres_alloc(devm_backlight_device_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) return ERR_PTR(-ENOMEM); backlight = backlight_device_register(name, parent, devdata, ops, props); if (!IS_ERR(backlight)) { *ptr = backlight; devres_add(dev, ptr); } else { devres_free(ptr); } return backlight; } EXPORT_SYMBOL(devm_backlight_device_register); /** * devm_backlight_device_unregister - unregister backlight device * @dev: the device to unregister * @bd: the backlight device to unregister * * Deallocates a backlight allocated with devm_backlight_device_register(). * Normally this function will not need to be called and the resource management * code will ensure that the resources are freed. */ void devm_backlight_device_unregister(struct device *dev, struct backlight_device *bd) { int rc; rc = devres_release(dev, devm_backlight_device_release, devm_backlight_device_match, bd); WARN_ON(rc); } EXPORT_SYMBOL(devm_backlight_device_unregister); #ifdef CONFIG_OF static int of_parent_match(struct device *dev, const void *data) { return dev->parent && dev->parent->of_node == data; } /** * of_find_backlight_by_node() - find backlight device by device-tree node * @node: device-tree node of the backlight device * * Returns a pointer to the backlight device corresponding to the given DT * node or NULL if no such backlight device exists or if the device hasn't * been probed yet. * * This function obtains a reference on the backlight device and it is the * caller's responsibility to drop the reference by calling put_device() on * the backlight device's .dev field. 
*/ struct backlight_device *of_find_backlight_by_node(struct device_node *node) { struct device *dev; dev = class_find_device(&backlight_class, NULL, node, of_parent_match); return dev ? to_backlight_device(dev) : NULL; } EXPORT_SYMBOL(of_find_backlight_by_node); #endif static struct backlight_device *of_find_backlight(struct device *dev) { struct backlight_device *bd = NULL; struct device_node *np; if (!dev) return NULL; if (IS_ENABLED(CONFIG_OF) && dev->of_node) { np = of_parse_phandle(dev->of_node, "backlight", 0); if (np) { bd = of_find_backlight_by_node(np); of_node_put(np); if (!bd) return ERR_PTR(-EPROBE_DEFER); } } return bd; } static void devm_backlight_release(void *data) { struct backlight_device *bd = data; put_device(&bd->dev); } /** * devm_of_find_backlight - find backlight for a device * @dev: the device * * This function looks for a property named 'backlight' on the DT node * connected to @dev and looks up the backlight device. The lookup is * device managed so the reference to the backlight device is automatically * dropped on driver detach. * * RETURNS: * * A pointer to the backlight device if found. * Error pointer -EPROBE_DEFER if the DT property is set, but no backlight * device is found. NULL if there's no backlight property. */ struct backlight_device *devm_of_find_backlight(struct device *dev) { struct backlight_device *bd; int ret; bd = of_find_backlight(dev); if (IS_ERR_OR_NULL(bd)) return bd; ret = devm_add_action_or_reset(dev, devm_backlight_release, bd); if (ret) return ERR_PTR(ret); return bd; } EXPORT_SYMBOL(devm_of_find_backlight); static void __exit backlight_class_exit(void) { class_unregister(&backlight_class); } static int __init backlight_class_init(void) { int ret; ret = class_register(&backlight_class); if (ret) { pr_warn("Unable to create backlight class; errno = %d\n", ret); return ret; } INIT_LIST_HEAD(&backlight_dev_list); mutex_init(&backlight_dev_list_mutex); return 0; } /* * if this is compiled into the kernel, we need to ensure that the * class is registered before users of the class try to register lcd's */ postcore_initcall(backlight_class_init); module_exit(backlight_class_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jamey Hicks <jamey.hicks@hp.com>, Andrew Zabolotny <zap@homelink.ru>"); MODULE_DESCRIPTION("Backlight Lowlevel Control Abstraction"); |
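/*
 * Illustrative sketch (not a real driver): the DOC overview above says a
 * driver registers with devm_backlight_device_register(), fills in
 * backlight_properties such as type and max_brightness, and implements
 * update_status() to apply the requested level. The hypothetical
 * my_hw_set_level() stands in for a real hardware accessor and the block
 * is not built here.
 */
#if 0
static int my_bl_update_status(struct backlight_device *bd)
{
	int level = bd->props.brightness;

	/* Treat blanked, powered-off or suspended states as brightness 0. */
	if (bd->props.power != FB_BLANK_UNBLANK ||
	    bd->props.state & (BL_CORE_SUSPENDED | BL_CORE_FBBLANK))
		level = 0;

	return my_hw_set_level(level);	/* hypothetical hardware accessor */
}

static const struct backlight_ops my_bl_ops = {
	.options	= BL_CORE_SUSPENDRESUME,
	.update_status	= my_bl_update_status,
};

static int my_bl_probe(struct device *dev)
{
	struct backlight_properties props = {
		.type		= BACKLIGHT_RAW,
		.max_brightness	= 255,
		.brightness	= 128,
	};
	struct backlight_device *bd;

	bd = devm_backlight_device_register(dev, "my-backlight", dev, NULL,
					    &my_bl_ops, &props);
	if (IS_ERR(bd))
		return PTR_ERR(bd);

	/* Push the initial brightness to the hardware. */
	backlight_update_status(bd);
	return 0;
}
#endif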
// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/lib/cmdline.c
 * Helper functions generally used for parsing kernel command line
 * and module options.
 *
 * Code and copyrights come from init/main.c and arch/i386/kernel/setup.c.
 *
 * GNU Indent formatting options for this file: -kr -i8 -npsl -pcs
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/ctype.h>

/*
 * If a hyphen was found in get_option, this will handle the
 * range of numbers, M-N. This will expand the range and insert
 * the values[M, M+1, ..., N] into the ints array in get_options.
 */
static int get_range(char **str, int *pint, int n)
{
	int x, inc_counter, upper_range;

	(*str)++;
	upper_range = simple_strtol((*str), NULL, 0);
	inc_counter = upper_range - *pint;
	for (x = *pint; n && x < upper_range; x++, n--)
		*pint++ = x;
	return inc_counter;
}

/**
 * get_option - Parse integer from an option string
 * @str: option string
 * @pint: (optional output) integer value parsed from @str
 *
 * Read an int from an option string; if available accept a subsequent
 * comma as well.
 *
 * When @pint is NULL the function can be used as a validator of
 * the current option in the string.
 *
 * Return values:
 * 0 - no int in string
 * 1 - int found, no subsequent comma
 * 2 - int found including a subsequent comma
 * 3 - hyphen found to denote a range
 *
 * Leading hyphen without integer is no integer case, but we consume it
 * for the sake of simplification.
 */
int get_option(char **str, int *pint)
{
	char *cur = *str;
	int value;

	if (!cur || !(*cur))
		return 0;
	if (*cur == '-')
		value = -simple_strtoull(++cur, str, 0);
	else
		value = simple_strtoull(cur, str, 0);
	if (pint)
		*pint = value;
	if (cur == *str)
		return 0;
	if (**str == ',') {
		(*str)++;
		return 2;
	}
	if (**str == '-')
		return 3;

	return 1;
}
EXPORT_SYMBOL(get_option);

/**
 * get_options - Parse a string into a list of integers
 * @str: String to be parsed
 * @nints: size of integer array
 * @ints: integer array (must have room for at least one element)
 *
 * This function parses a string containing a comma-separated
 * list of integers, a hyphen-separated range of _positive_ integers,
 * or a combination of both. The parse halts when the array is
 * full, or when no more numbers can be retrieved from the
 * string.
 *
 * When @nints is 0, the function just validates the given @str and
 * returns the amount of parseable integers as described below.
 *
 * Returns:
 *
 * The first element is filled by the number of collected integers
 * in the range.
The rest is what was parsed from the @str. * * Return value is the character in the string which caused * the parse to end (typically a null terminator, if @str is * completely parseable). */ char *get_options(const char *str, int nints, int *ints) { bool validate = (nints == 0); int res, i = 1; while (i < nints || validate) { int *pint = validate ? ints : ints + i; res = get_option((char **)&str, pint); if (res == 0) break; if (res == 3) { int n = validate ? 0 : nints - i; int range_nums; range_nums = get_range((char **)&str, pint, n); if (range_nums < 0) break; /* * Decrement the result by one to leave out the * last number in the range. The next iteration * will handle the upper number in the range */ i += (range_nums - 1); } i++; if (res == 1) break; } ints[0] = i - 1; return (char *)str; } EXPORT_SYMBOL(get_options); /** * memparse - parse a string with mem suffixes into a number * @ptr: Where parse begins * @retptr: (output) Optional pointer to next char after parse completes * * Parses a string into a number. The number stored at @ptr is * potentially suffixed with K, M, G, T, P, E. */ unsigned long long memparse(const char *ptr, char **retptr) { char *endptr; /* local pointer to end of parsed string */ unsigned long long ret = simple_strtoull(ptr, &endptr, 0); switch (*endptr) { case 'E': case 'e': ret <<= 10; fallthrough; case 'P': case 'p': ret <<= 10; fallthrough; case 'T': case 't': ret <<= 10; fallthrough; case 'G': case 'g': ret <<= 10; fallthrough; case 'M': case 'm': ret <<= 10; fallthrough; case 'K': case 'k': ret <<= 10; endptr++; fallthrough; default: break; } if (retptr) *retptr = endptr; return ret; } EXPORT_SYMBOL(memparse); /** * parse_option_str - Parse a string and check an option is set or not * @str: String to be parsed * @option: option name * * This function parses a string containing a comma-separated list of * strings like a=b,c. * * Return true if there's such option in the string, or return false. */ bool parse_option_str(const char *str, const char *option) { while (*str) { if (!strncmp(str, option, strlen(option))) { str += strlen(option); if (!*str || *str == ',') return true; } while (*str && *str != ',') str++; if (*str == ',') str++; } return false; } /* * Parse a string to get a param value pair. * You can use " around spaces, but can't escape ". * Hyphens and underscores equivalent in parameter names. */ char *next_arg(char *args, char **param, char **val) { unsigned int i, equals = 0; int in_quote = 0, quoted = 0; if (*args == '"') { args++; in_quote = 1; quoted = 1; } for (i = 0; args[i]; i++) { if (isspace(args[i]) && !in_quote) break; if (equals == 0) { if (args[i] == '=') equals = i; } if (args[i] == '"') in_quote = !in_quote; } *param = args; if (!equals) *val = NULL; else { args[equals] = '\0'; *val = args + equals + 1; /* Don't include quotes in value. */ if (**val == '"') { (*val)++; if (args[i-1] == '"') args[i-1] = '\0'; } } if (quoted && i > 0 && args[i-1] == '"') args[i-1] = '\0'; if (args[i]) { args[i] = '\0'; args += i + 1; } else args += i; /* Chew up trailing spaces. */ return skip_spaces(args); } EXPORT_SYMBOL(next_arg); |
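/*
 * Editor's illustration (not part of lib/cmdline.c): how a hypothetical
 * boot parameter could use the helpers above. The "foo=" / "foo_buf="
 * parameters, foo_setup()/foo_buf_setup() and the foo_channels array are
 * assumptions made for the sketch; get_options(), memparse() and __setup()
 * are the real interfaces.
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/string.h>

static int foo_channels[8];
static unsigned long long foo_bufsize;

/* e.g. foo=1,3-5 yields foo_channels[0] = 4 and values {1, 3, 4, 5} */
static int __init foo_setup(char *str)
{
	get_options(str, ARRAY_SIZE(foo_channels), foo_channels);
	return 1;
}
__setup("foo=", foo_setup);

/* e.g. foo_buf=128K yields 131072, foo_buf=1M yields 1048576 */
static int __init foo_buf_setup(char *str)
{
	foo_bufsize = memparse(str, NULL);
	return 1;
}
__setup("foo_buf=", foo_buf_setup);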
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Prevent the compiler from merging or refetching reads or writes. The
 * compiler is also forbidden from reordering successive instances of
 * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some
 * particular ordering. One way to make the compiler aware of ordering is to
 * put the two invocations of READ_ONCE or WRITE_ONCE in different C
 * statements.
 *
 * These two macros will also work on aggregate data types like structs or
 * unions.
 *
 * Their two major use cases are: (1) Mediating communication between
 * process-level code and irq/NMI handlers, all running on the same CPU,
 * and (2) Ensuring that the compiler does not fold, spindle, or otherwise
 * mutilate accesses that either do not require ordering or that interact
 * with an explicit memory barrier or atomic instruction that provides the
 * required ordering.
 */
#ifndef __ASM_GENERIC_RWONCE_H
#define __ASM_GENERIC_RWONCE_H

#ifndef __ASSEMBLY__

#include <linux/compiler_types.h>
#include <linux/kasan-checks.h>
#include <linux/kcsan-checks.h>

/*
 * Yes, this permits 64-bit accesses on 32-bit architectures. These will
 * actually be atomic in some cases (namely Armv7 + LPAE), but for others we
 * rely on the access being split into 2x32-bit accesses for a 32-bit quantity
 * (e.g. a virtual address) and a strong prevailing wind.
 */
#define compiletime_assert_rwonce_type(t)					\
	compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),	\
		"Unsupported access size for {READ,WRITE}_ONCE().")

/*
 * Use __READ_ONCE() instead of READ_ONCE() if you do not require any
 * atomicity. Note that this may result in tears!
 */
#ifndef __READ_ONCE
#define __READ_ONCE(x)	(*(const volatile __unqual_scalar_typeof(x) *)&(x))
#endif

#define READ_ONCE(x)							\
({									\
	compiletime_assert_rwonce_type(x);				\
	__READ_ONCE(x);							\
})

#define __WRITE_ONCE(x, val)						\
do {									\
	*(volatile typeof(x) *)&(x) = (val);				\
} while (0)

#define WRITE_ONCE(x, val)						\
do {									\
	compiletime_assert_rwonce_type(x);				\
	__WRITE_ONCE(x, val);						\
} while (0)

static __no_sanitize_or_inline
unsigned long __read_once_word_nocheck(const void *addr)
{
	return __READ_ONCE(*(unsigned long *)addr);
}

/*
 * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a
 * word from memory atomically but without telling KASAN/KCSAN. This is
 * usually used by unwinding code when walking the stack of a running process.
 */
#define READ_ONCE_NOCHECK(x)						\
({									\
	compiletime_assert(sizeof(x) == sizeof(unsigned long),		\
		"Unsupported access size for READ_ONCE_NOCHECK().");	\
	(typeof(x))__read_once_word_nocheck(&(x));			\
})

static __no_kasan_or_inline
unsigned long read_word_at_a_time(const void *addr)
{
	kasan_check_read(addr, 1);
	return *(unsigned long *)addr;
}

#endif /* __ASSEMBLY__ */
#endif /* __ASM_GENERIC_RWONCE_H */
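/*
 * Editor's illustration (not part of this header): the communication
 * pattern the comment above describes. READ_ONCE()/WRITE_ONCE() keep the
 * compiler from tearing, caching or hoisting accesses to a flag shared
 * between process context and an interrupt handler. foo_done,
 * foo_irq_handler() and foo_wait() are hypothetical names; the macros are
 * normally reached through <linux/compiler.h> rather than included here
 * directly.
 */
#include <linux/compiler.h>
#include <linux/types.h>

static bool foo_done;

/* runs in interrupt context */
static void foo_irq_handler(void)
{
	WRITE_ONCE(foo_done, true);	/* single, untorn store */
}

/* runs in process context */
static void foo_wait(void)
{
	/* READ_ONCE() forces a fresh load on every iteration */
	while (!READ_ONCE(foo_done))
		cpu_relax();		/* cpu_relax() comes via <asm/processor.h> */
}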
// SPDX-License-Identifier: GPL-2.0-only
/*
  File: fs/xattr.c

  Extended attribute handling.

  Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
  Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com>
  Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 */
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fsnotify.h>
#include <linux/audit.h>
#include <linux/vmalloc.h>
#include <linux/posix_acl_xattr.h>

#include <linux/uaccess.h>

#include "internal.h"

static const char *
strcmp_prefix(const char *a, const char *a_prefix)
{
	while (*a_prefix && *a == *a_prefix) {
		a++;
		a_prefix++;
	}
	return *a_prefix ? NULL : a;
}

/*
 * In order to implement different sets of xattr operations for each xattr
 * prefix, a filesystem should create a null-terminated array of struct
 * xattr_handler (one for each prefix) and hang a pointer to it off of the
 * s_xattr field of the superblock.
 */
#define for_each_xattr_handler(handlers, handler)		\
	if (handlers)						\
		for ((handler) = *(handlers)++;			\
			(handler) != NULL;			\
			(handler) = *(handlers)++)

/*
 * Find the xattr_handler with the matching prefix.
 */
static const struct xattr_handler *
xattr_resolve_name(struct inode *inode, const char **name)
{
	const struct xattr_handler * const *handlers = inode->i_sb->s_xattr;
	const struct xattr_handler *handler;

	if (!(inode->i_opflags & IOP_XATTR)) {
		if (unlikely(is_bad_inode(inode)))
			return ERR_PTR(-EIO);
		return ERR_PTR(-EOPNOTSUPP);
	}
	for_each_xattr_handler(handlers, handler) {
		const char *n;

		n = strcmp_prefix(*name, xattr_prefix(handler));
		if (n) {
			if (!handler->prefix ^ !*n) {
				if (*n)
					continue;
				return ERR_PTR(-EINVAL);
			}
			*name = n;
			return handler;
		}
	}
	return ERR_PTR(-EOPNOTSUPP);
}

/**
 * may_write_xattr - check whether inode allows writing xattr
 * @idmap: idmap of the mount the inode was found from
 * @inode: the inode on which to set an xattr
 *
 * Check whether the inode allows writing xattrs. Specifically, we can never
 * set or remove an extended attribute on a read-only filesystem or on an
 * immutable / append-only inode.
 *
 * We also need to ensure that the inode has a mapping in the mount to
 * not risk writing back invalid i_{g,u}id values.
 *
 * Return: On success zero is returned. On error a negative errno is returned.
 */
int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode)
{
	if (IS_IMMUTABLE(inode))
		return -EPERM;
	if (IS_APPEND(inode))
		return -EPERM;
	if (HAS_UNMAPPED_ID(idmap, inode))
		return -EPERM;
	return 0;
}

/*
 * Check permissions for extended attribute access. This is a bit complicated
 * because different namespaces have very different rules.
 */
static int
xattr_permission(struct mnt_idmap *idmap, struct inode *inode,
		 const char *name, int mask)
{
	if (mask & MAY_WRITE) {
		int ret;

		ret = may_write_xattr(idmap, inode);
		if (ret)
			return ret;
	}

	/*
	 * No restriction for security.* and system.* from the VFS. Decision
	 * on these is left to the underlying filesystem / security module.
	 */
	if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
	    !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
		return 0;

	/*
	 * The trusted.* namespace can only be accessed by privileged users.
*/ if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { if (!capable(CAP_SYS_ADMIN)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; return 0; } /* * In the user.* namespace, only regular files and directories can have * extended attributes. For sticky directories, only the owner and * privileged users can write attributes. */ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && (mask & MAY_WRITE) && !inode_owner_or_capable(idmap, inode)) return -EPERM; } return inode_permission(idmap, inode, mask); } /* * Look for any handler that deals with the specified namespace. */ int xattr_supports_user_prefix(struct inode *inode) { const struct xattr_handler * const *handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; if (!(inode->i_opflags & IOP_XATTR)) { if (unlikely(is_bad_inode(inode))) return -EIO; return -EOPNOTSUPP; } for_each_xattr_handler(handlers, handler) { if (!strncmp(xattr_prefix(handler), XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) return 0; } return -EOPNOTSUPP; } EXPORT_SYMBOL(xattr_supports_user_prefix); int __vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; if (size == 0) value = ""; /* empty EA, do not remove */ return handler->set(handler, idmap, dentry, inode, name, value, size, flags); } EXPORT_SYMBOL(__vfs_setxattr); /** * __vfs_setxattr_noperm - perform setxattr operation without performing * permission checks. * * @idmap: idmap of the mount the inode was found from * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to * @size: size of @value * @flags: flags to pass into filesystem operations * * returns the result of the internal setxattr or setsecurity operations. * * This function requires the caller to lock the inode's i_mutex before it * is executed. It also assumes that the caller will make the appropriate * permission checks. 
*/ int __vfs_setxattr_noperm(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; int error = -EAGAIN; int issec = !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); if (issec) inode->i_flags &= ~S_NOSEC; if (inode->i_opflags & IOP_XATTR) { error = __vfs_setxattr(idmap, dentry, inode, name, value, size, flags); if (!error) { fsnotify_xattr(dentry); security_inode_post_setxattr(dentry, name, value, size, flags); } } else { if (unlikely(is_bad_inode(inode))) return -EIO; } if (error == -EAGAIN) { error = -EOPNOTSUPP; if (issec) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; error = security_inode_setsecurity(inode, suffix, value, size, flags); if (!error) fsnotify_xattr(dentry); } } return error; } /** * __vfs_setxattr_locked - set an extended attribute while holding the inode * lock * * @idmap: idmap of the mount of the target inode * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to * @size: size of @value * @flags: flags to pass into filesystem operations * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int __vfs_setxattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; error = security_inode_setxattr(idmap, dentry, name, value, size, flags); if (error) goto out; error = try_break_deleg(inode, delegated_inode); if (error) goto out; error = __vfs_setxattr_noperm(idmap, dentry, name, value, size, flags); out: return error; } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); int vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; const void *orig_value = value; int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { error = cap_convert_nscap(idmap, dentry, &value, size); if (error < 0) return error; size = error; } retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(idmap, dentry, name, value, size, flags, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } if (value != orig_value) kfree(value); return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); static ssize_t xattr_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void *value, size_t size) { void *buffer = NULL; ssize_t len; if (!value || !size) { len = security_inode_getsecurity(idmap, inode, name, &buffer, false); goto out_noalloc; } len = security_inode_getsecurity(idmap, inode, name, &buffer, true); if (len < 0) return len; if (size < len) { len = -ERANGE; goto out; } memcpy(value, buffer, len); out: kfree(buffer); out_noalloc: return len; } /* * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr * * Allocate memory, if not already allocated, or re-allocate correct size, * before retrieving the extended attribute. The xattr value buffer should * always be freed by the caller, even on error. * * Returns the result of alloc, if failed, or the getxattr operation. 
*/ int vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t xattr_size, gfp_t flags) { const struct xattr_handler *handler; struct inode *inode = dentry->d_inode; char *value = *xattr_value; int error; error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->get) return -EOPNOTSUPP; error = handler->get(handler, dentry, inode, name, NULL, 0); if (error < 0) return error; if (!value || (error > xattr_size)) { value = krealloc(*xattr_value, error + 1, flags); if (!value) return -ENOMEM; memset(value, 0, error + 1); } error = handler->get(handler, dentry, inode, name, value, error); *xattr_value = value; return error; } ssize_t __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->get) return -EOPNOTSUPP; return handler->get(handler, dentry, inode, name, value, size); } EXPORT_SYMBOL(__vfs_getxattr); ssize_t vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, void *value, size_t size) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; error = security_inode_getxattr(dentry, name); if (error) return error; if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; int ret = xattr_getsecurity(idmap, inode, suffix, value, size); /* * Only overwrite the return value if a security module * is actually active. */ if (ret == -EOPNOTSUPP) goto nolsm; return ret; } nolsm: return __vfs_getxattr(dentry, inode, name, value, size); } EXPORT_SYMBOL_GPL(vfs_getxattr); /** * vfs_listxattr - retrieve \0 separated list of xattr names * @dentry: the dentry from whose inode the xattr names are retrieved * @list: buffer to store xattr names into * @size: size of the buffer * * This function returns the names of all xattrs associated with the * inode of @dentry. * * Note, for legacy reasons the vfs_listxattr() function lists POSIX * ACLs as well. Since POSIX ACLs are decoupled from IOP_XATTR the * vfs_listxattr() function doesn't check for this flag since a * filesystem could implement POSIX ACLs without implementing any other * xattrs. * * However, since all codepaths that remove IOP_XATTR also assign of * inode operations that either don't implement or implement a stub * ->listxattr() operation. * * Return: On success, the size of the buffer that was used. On error a * negative error code. 
*/ ssize_t vfs_listxattr(struct dentry *dentry, char *list, size_t size) { struct inode *inode = d_inode(dentry); ssize_t error; error = security_inode_listxattr(dentry); if (error) return error; if (inode->i_op->listxattr) { error = inode->i_op->listxattr(dentry, list, size); } else { error = security_inode_listsecurity(inode, list, size); if (size && error > size) error = -ERANGE; } return error; } EXPORT_SYMBOL_GPL(vfs_listxattr); int __vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = d_inode(dentry); const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; return handler->set(handler, idmap, dentry, inode, name, NULL, 0, XATTR_REPLACE); } EXPORT_SYMBOL(__vfs_removexattr); /** * __vfs_removexattr_locked - set an extended attribute while holding the inode * lock * * @idmap: idmap of the mount of the target inode * @dentry: object to perform setxattr on * @name: name of xattr to remove * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int __vfs_removexattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; error = security_inode_removexattr(idmap, dentry, name); if (error) goto out; error = try_break_deleg(inode, delegated_inode); if (error) goto out; error = __vfs_removexattr(idmap, dentry, name); if (error) return error; fsnotify_xattr(dentry); security_inode_post_removexattr(dentry, name); out: return error; } EXPORT_SYMBOL_GPL(__vfs_removexattr_locked); int vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; int error; retry_deleg: inode_lock(inode); error = __vfs_removexattr_locked(idmap, dentry, name, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); int import_xattr_name(struct xattr_name *kname, const char __user *name) { int error = strncpy_from_user(kname->name, name, sizeof(kname->name)); if (error == 0 || error == sizeof(kname->name)) return -ERANGE; if (error < 0) return error; return 0; } /* * Extended attribute SET operations */ int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx) { int error; if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE)) return -EINVAL; error = import_xattr_name(ctx->kname, name); if (error) return error; if (ctx->size) { if (ctx->size > XATTR_SIZE_MAX) return -E2BIG; ctx->kvalue = vmemdup_user(ctx->cvalue, ctx->size); if (IS_ERR(ctx->kvalue)) { error = PTR_ERR(ctx->kvalue); ctx->kvalue = NULL; } } return error; } static int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct kernel_xattr_ctx *ctx) { if (is_posix_acl_xattr(ctx->kname->name)) return do_set_acl(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size); return vfs_setxattr(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size, ctx->flags); } int file_setxattr(struct file *f, struct kernel_xattr_ctx *ctx) { int error = mnt_want_write_file(f); if (!error) { audit_file(f); error = do_setxattr(file_mnt_idmap(f), f->f_path.dentry, ctx); 
mnt_drop_write_file(f); } return error; } /* unconditionally consumes filename */ int filename_setxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx) { struct path path; int error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = mnt_want_write(path.mnt); if (!error) { error = do_setxattr(mnt_idmap(path.mnt), path.dentry, ctx); mnt_drop_write(path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: putname(filename); return error; } static int path_setxattrat(int dfd, const char __user *pathname, unsigned int at_flags, const char __user *name, const void __user *value, size_t size, int flags) { struct xattr_name kname; struct kernel_xattr_ctx ctx = { .cvalue = value, .kvalue = NULL, .size = size, .kname = &kname, .flags = flags, }; struct filename *filename; unsigned int lookup_flags = 0; int error; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; if (!(at_flags & AT_SYMLINK_NOFOLLOW)) lookup_flags = LOOKUP_FOLLOW; error = setxattr_copy(name, &ctx); if (error) return error; filename = getname_maybe_null(pathname, at_flags); if (!filename) { CLASS(fd, f)(dfd); if (fd_empty(f)) error = -EBADF; else error = file_setxattr(fd_file(f), &ctx); } else { error = filename_setxattr(dfd, filename, lookup_flags, &ctx); } kvfree(ctx.kvalue); return error; } SYSCALL_DEFINE6(setxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, const char __user *, name, const struct xattr_args __user *, uargs, size_t, usize) { struct xattr_args args = {}; int error; BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0); BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST); if (unlikely(usize < XATTR_ARGS_SIZE_VER0)) return -EINVAL; if (usize > PAGE_SIZE) return -E2BIG; error = copy_struct_from_user(&args, sizeof(args), uargs, usize); if (error) return error; return path_setxattrat(dfd, pathname, at_flags, name, u64_to_user_ptr(args.value), args.size, args.flags); } SYSCALL_DEFINE5(setxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { return path_setxattrat(AT_FDCWD, pathname, 0, name, value, size, flags); } SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { return path_setxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name, value, size, flags); } SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, const void __user *,value, size_t, size, int, flags) { return path_setxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size, flags); } /* * Extended attribute GET operations */ static ssize_t do_getxattr(struct mnt_idmap *idmap, struct dentry *d, struct kernel_xattr_ctx *ctx) { ssize_t error; char *kname = ctx->kname->name; void *kvalue = NULL; if (ctx->size) { if (ctx->size > XATTR_SIZE_MAX) ctx->size = XATTR_SIZE_MAX; kvalue = kvzalloc(ctx->size, GFP_KERNEL); if (!kvalue) return -ENOMEM; } if (is_posix_acl_xattr(kname)) error = do_get_acl(idmap, d, kname, kvalue, ctx->size); else error = vfs_getxattr(idmap, d, kname, kvalue, ctx->size); if (error > 0) { if (ctx->size && copy_to_user(ctx->value, kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { /* The file system tried to returned a value bigger than XATTR_SIZE_MAX bytes. Not possible. 
*/ error = -E2BIG; } kvfree(kvalue); return error; } ssize_t file_getxattr(struct file *f, struct kernel_xattr_ctx *ctx) { audit_file(f); return do_getxattr(file_mnt_idmap(f), f->f_path.dentry, ctx); } /* unconditionally consumes filename */ ssize_t filename_getxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx) { struct path path; ssize_t error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = do_getxattr(mnt_idmap(path.mnt), path.dentry, ctx); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: putname(filename); return error; } static ssize_t path_getxattrat(int dfd, const char __user *pathname, unsigned int at_flags, const char __user *name, void __user *value, size_t size) { struct xattr_name kname; struct kernel_xattr_ctx ctx = { .value = value, .size = size, .kname = &kname, .flags = 0, }; struct filename *filename; ssize_t error; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; error = import_xattr_name(&kname, name); if (error) return error; filename = getname_maybe_null(pathname, at_flags); if (!filename) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; return file_getxattr(fd_file(f), &ctx); } else { int lookup_flags = 0; if (!(at_flags & AT_SYMLINK_NOFOLLOW)) lookup_flags = LOOKUP_FOLLOW; return filename_getxattr(dfd, filename, lookup_flags, &ctx); } } SYSCALL_DEFINE6(getxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, const char __user *, name, struct xattr_args __user *, uargs, size_t, usize) { struct xattr_args args = {}; int error; BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0); BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST); if (unlikely(usize < XATTR_ARGS_SIZE_VER0)) return -EINVAL; if (usize > PAGE_SIZE) return -E2BIG; error = copy_struct_from_user(&args, sizeof(args), uargs, usize); if (error) return error; if (args.flags != 0) return -EINVAL; return path_getxattrat(dfd, pathname, at_flags, name, u64_to_user_ptr(args.value), args.size); } SYSCALL_DEFINE4(getxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { return path_getxattrat(AT_FDCWD, pathname, 0, name, value, size); } SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { return path_getxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name, value, size); } SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, void __user *, value, size_t, size) { return path_getxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size); } /* * Extended attribute LIST operations */ static ssize_t listxattr(struct dentry *d, char __user *list, size_t size) { ssize_t error; char *klist = NULL; if (size) { if (size > XATTR_LIST_MAX) size = XATTR_LIST_MAX; klist = kvmalloc(size, GFP_KERNEL); if (!klist) return -ENOMEM; } error = vfs_listxattr(d, klist, size); if (error > 0) { if (size && copy_to_user(list, klist, error)) error = -EFAULT; } else if (error == -ERANGE && size >= XATTR_LIST_MAX) { /* The file system tried to returned a list bigger than XATTR_LIST_MAX bytes. Not possible. 
*/ error = -E2BIG; } kvfree(klist); return error; } static ssize_t file_listxattr(struct file *f, char __user *list, size_t size) { audit_file(f); return listxattr(f->f_path.dentry, list, size); } /* unconditionally consumes filename */ static ssize_t filename_listxattr(int dfd, struct filename *filename, unsigned int lookup_flags, char __user *list, size_t size) { struct path path; ssize_t error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = listxattr(path.dentry, list, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: putname(filename); return error; } static ssize_t path_listxattrat(int dfd, const char __user *pathname, unsigned int at_flags, char __user *list, size_t size) { struct filename *filename; int lookup_flags; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; filename = getname_maybe_null(pathname, at_flags); if (!filename) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; return file_listxattr(fd_file(f), list, size); } lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; return filename_listxattr(dfd, filename, lookup_flags, list, size); } SYSCALL_DEFINE5(listxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, char __user *, list, size_t, size) { return path_listxattrat(dfd, pathname, at_flags, list, size); } SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, size_t, size) { return path_listxattrat(AT_FDCWD, pathname, 0, list, size); } SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, size_t, size) { return path_listxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, list, size); } SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) { return path_listxattrat(fd, NULL, AT_EMPTY_PATH, list, size); } /* * Extended attribute REMOVE operations */ static long removexattr(struct mnt_idmap *idmap, struct dentry *d, const char *name) { if (is_posix_acl_xattr(name)) return vfs_remove_acl(idmap, d, name); return vfs_removexattr(idmap, d, name); } static int file_removexattr(struct file *f, struct xattr_name *kname) { int error = mnt_want_write_file(f); if (!error) { audit_file(f); error = removexattr(file_mnt_idmap(f), f->f_path.dentry, kname->name); mnt_drop_write_file(f); } return error; } /* unconditionally consumes filename */ static int filename_removexattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct xattr_name *kname) { struct path path; int error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = mnt_want_write(path.mnt); if (!error) { error = removexattr(mnt_idmap(path.mnt), path.dentry, kname->name); mnt_drop_write(path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: putname(filename); return error; } static int path_removexattrat(int dfd, const char __user *pathname, unsigned int at_flags, const char __user *name) { struct xattr_name kname; struct filename *filename; unsigned int lookup_flags; int error; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; error = import_xattr_name(&kname, name); if (error) return error; filename = getname_maybe_null(pathname, at_flags); if (!filename) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; return file_removexattr(fd_file(f), &kname); } lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 
0 : LOOKUP_FOLLOW; return filename_removexattr(dfd, filename, lookup_flags, &kname); } SYSCALL_DEFINE4(removexattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, const char __user *, name) { return path_removexattrat(dfd, pathname, at_flags, name); } SYSCALL_DEFINE2(removexattr, const char __user *, pathname, const char __user *, name) { return path_removexattrat(AT_FDCWD, pathname, 0, name); } SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, const char __user *, name) { return path_removexattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name); } SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) { return path_removexattrat(fd, NULL, AT_EMPTY_PATH, name); } int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name) { size_t len; len = strlen(name) + 1; if (*buffer) { if (*remaining_size < len) return -ERANGE; memcpy(*buffer, name, len); *buffer += len; } *remaining_size -= len; return 0; } /** * generic_listxattr - run through a dentry's xattr list() operations * @dentry: dentry to list the xattrs * @buffer: result buffer * @buffer_size: size of @buffer * * Combine the results of the list() operation from every xattr_handler in the * xattr_handler stack. * * Note that this will not include the entries for POSIX ACLs. */ ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr; ssize_t remaining_size = buffer_size; for_each_xattr_handler(handlers, handler) { int err; if (!handler->name || (handler->list && !handler->list(dentry))) continue; err = xattr_list_one(&buffer, &remaining_size, handler->name); if (err) return err; } return buffer_size - remaining_size; } EXPORT_SYMBOL(generic_listxattr); /** * xattr_full_name - Compute full attribute name from suffix * * @handler: handler of the xattr_handler operation * @name: name passed to the xattr_handler operation * * The get and set xattr handler operations are called with the remainder of * the attribute name after skipping the handler's prefix: for example, "foo" * is passed to the get operation of a handler with prefix "user." to get * attribute "user.foo". The full name is still "there" in the name though. * * Note: the list xattr handler operation when called from the vfs is passed a * NULL name; some file systems use this operation internally, with varying * semantics. */ const char *xattr_full_name(const struct xattr_handler *handler, const char *name) { size_t prefix_len = strlen(xattr_prefix(handler)); return name - prefix_len; } EXPORT_SYMBOL(xattr_full_name); /** * simple_xattr_space - estimate the memory used by a simple xattr * @name: the full name of the xattr * @size: the size of its value * * This takes no account of how much larger the two slab objects actually are: * that would depend on the slab implementation, when what is required is a * deterministic number, which grows with name length and size and quantity. * * Return: The approximate number of bytes of memory used by such an xattr. */ size_t simple_xattr_space(const char *name, size_t size) { /* * Use "40" instead of sizeof(struct simple_xattr), to return the * same result on 32-bit and 64-bit, and even if simple_xattr grows. */ return 40 + size + strlen(name); } /** * simple_xattr_free - free an xattr object * @xattr: the xattr object * * Free the xattr object. Can handle @xattr being NULL. 
*/ void simple_xattr_free(struct simple_xattr *xattr) { if (xattr) kfree(xattr->name); kvfree(xattr); } /** * simple_xattr_alloc - allocate new xattr object * @value: value of the xattr object * @size: size of @value * * Allocate a new xattr object and initialize respective members. The caller is * responsible for handling the name of the xattr. * * Return: On success a new xattr object is returned. On failure NULL is * returned. */ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) { struct simple_xattr *new_xattr; size_t len; /* wrap around? */ len = sizeof(*new_xattr) + size; if (len < sizeof(*new_xattr)) return NULL; new_xattr = kvmalloc(len, GFP_KERNEL_ACCOUNT); if (!new_xattr) return NULL; new_xattr->size = size; memcpy(new_xattr->value, value, size); return new_xattr; } /** * rbtree_simple_xattr_cmp - compare xattr name with current rbtree xattr entry * @key: xattr name * @node: current node * * Compare the xattr name with the xattr name attached to @node in the rbtree. * * Return: Negative value if continuing left, positive if continuing right, 0 * if the xattr attached to @node matches @key. */ static int rbtree_simple_xattr_cmp(const void *key, const struct rb_node *node) { const char *xattr_name = key; const struct simple_xattr *xattr; xattr = rb_entry(node, struct simple_xattr, rb_node); return strcmp(xattr->name, xattr_name); } /** * rbtree_simple_xattr_node_cmp - compare two xattr rbtree nodes * @new_node: new node * @node: current node * * Compare the xattr attached to @new_node with the xattr attached to @node. * * Return: Negative value if continuing left, positive if continuing right, 0 * if the xattr attached to @new_node matches the xattr attached to @node. */ static int rbtree_simple_xattr_node_cmp(struct rb_node *new_node, const struct rb_node *node) { struct simple_xattr *xattr; xattr = rb_entry(new_node, struct simple_xattr, rb_node); return rbtree_simple_xattr_cmp(xattr->name, node); } /** * simple_xattr_get - get an xattr object * @xattrs: the header of the xattr object * @name: the name of the xattr to retrieve * @buffer: the buffer to store the value into * @size: the size of @buffer * * Try to find and retrieve the xattr object associated with @name. * If @buffer is provided store the value of @xattr in @buffer * otherwise just return the length. The size of @buffer is limited * to XATTR_SIZE_MAX which currently is 65536. * * Return: On success the length of the xattr value is returned. On error a * negative error code is returned. */ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size) { struct simple_xattr *xattr = NULL; struct rb_node *rbp; int ret = -ENODATA; read_lock(&xattrs->lock); rbp = rb_find(name, &xattrs->rb_root, rbtree_simple_xattr_cmp); if (rbp) { xattr = rb_entry(rbp, struct simple_xattr, rb_node); ret = xattr->size; if (buffer) { if (size < xattr->size) ret = -ERANGE; else memcpy(buffer, xattr->value, xattr->size); } } read_unlock(&xattrs->lock); return ret; } /** * simple_xattr_set - set an xattr object * @xattrs: the header of the xattr object * @name: the name of the xattr to retrieve * @value: the value to store along the xattr * @size: the size of @value * @flags: the flags determining how to set the xattr * * Set a new xattr object. * If @value is passed a new xattr object will be allocated. If XATTR_REPLACE * is specified in @flags a matching xattr object for @name must already exist. * If it does it will be replaced with the new xattr object. If it doesn't we * fail. 
If XATTR_CREATE is specified and a matching xattr does already exist * we fail. If it doesn't we create a new xattr. If @flags is zero we simply * insert the new xattr replacing any existing one. * * If @value is empty and a matching xattr object is found we delete it if * XATTR_REPLACE is specified in @flags or @flags is zero. * * If @value is empty and no matching xattr object for @name is found we do * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For * XATTR_REPLACE we fail as mentioned above. * * Return: On success, the removed or replaced xattr is returned, to be freed * by the caller; or NULL if none. On failure a negative error code is returned. */ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags) { struct simple_xattr *old_xattr = NULL, *new_xattr = NULL; struct rb_node *parent = NULL, **rbp; int err = 0, ret; /* value == NULL means remove */ if (value) { new_xattr = simple_xattr_alloc(value, size); if (!new_xattr) return ERR_PTR(-ENOMEM); new_xattr->name = kstrdup(name, GFP_KERNEL_ACCOUNT); if (!new_xattr->name) { simple_xattr_free(new_xattr); return ERR_PTR(-ENOMEM); } } write_lock(&xattrs->lock); rbp = &xattrs->rb_root.rb_node; while (*rbp) { parent = *rbp; ret = rbtree_simple_xattr_cmp(name, *rbp); if (ret < 0) rbp = &(*rbp)->rb_left; else if (ret > 0) rbp = &(*rbp)->rb_right; else old_xattr = rb_entry(*rbp, struct simple_xattr, rb_node); if (old_xattr) break; } if (old_xattr) { /* Fail if XATTR_CREATE is requested and the xattr exists. */ if (flags & XATTR_CREATE) { err = -EEXIST; goto out_unlock; } if (new_xattr) rb_replace_node(&old_xattr->rb_node, &new_xattr->rb_node, &xattrs->rb_root); else rb_erase(&old_xattr->rb_node, &xattrs->rb_root); } else { /* Fail if XATTR_REPLACE is requested but no xattr is found. */ if (flags & XATTR_REPLACE) { err = -ENODATA; goto out_unlock; } /* * If XATTR_CREATE or no flags are specified together with a * new value simply insert it. */ if (new_xattr) { rb_link_node(&new_xattr->rb_node, parent, rbp); rb_insert_color(&new_xattr->rb_node, &xattrs->rb_root); } /* * If XATTR_CREATE or no flags are specified and neither an * old or new xattr exist then we don't need to do anything. */ } out_unlock: write_unlock(&xattrs->lock); if (!err) return old_xattr; simple_xattr_free(new_xattr); return ERR_PTR(err); } static bool xattr_is_trusted(const char *name) { return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } /** * simple_xattr_list - list all xattr objects * @inode: inode from which to get the xattrs * @xattrs: the header of the xattr object * @buffer: the buffer to store all xattrs into * @size: the size of @buffer * * List all xattrs associated with @inode. If @buffer is NULL we returned * the required size of the buffer. If @buffer is provided we store the * xattrs value into it provided it is big enough. * * Note, the number of xattr names that can be listed with listxattr(2) is * limited to XATTR_LIST_MAX aka 65536 bytes. If a larger buffer is passed * then vfs_listxattr() caps it to XATTR_LIST_MAX and if more xattr names * are found it will return -E2BIG. * * Return: On success the required size or the size of the copied xattrs is * returned. On error a negative error code is returned. 
*/ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size) { bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); struct simple_xattr *xattr; struct rb_node *rbp; ssize_t remaining_size = size; int err = 0; err = posix_acl_listxattr(inode, &buffer, &remaining_size); if (err) return err; read_lock(&xattrs->lock); for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { xattr = rb_entry(rbp, struct simple_xattr, rb_node); /* skip "trusted." attributes for unprivileged callers */ if (!trusted && xattr_is_trusted(xattr->name)) continue; err = xattr_list_one(&buffer, &remaining_size, xattr->name); if (err) break; } read_unlock(&xattrs->lock); return err ? err : size - remaining_size; } /** * rbtree_simple_xattr_less - compare two xattr rbtree nodes * @new_node: new node * @node: current node * * Compare the xattr attached to @new_node with the xattr attached to @node. * Note that this function technically tolerates duplicate entries. * * Return: True if insertion point in the rbtree is found. */ static bool rbtree_simple_xattr_less(struct rb_node *new_node, const struct rb_node *node) { return rbtree_simple_xattr_node_cmp(new_node, node) < 0; } /** * simple_xattr_add - add xattr objects * @xattrs: the header of the xattr object * @new_xattr: the xattr object to add * * Add an xattr object to @xattrs. This assumes no replacement or removal * of matching xattrs is wanted. Should only be called during inode * initialization when a few distinct initial xattrs are supposed to be set. */ void simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr) { write_lock(&xattrs->lock); rb_add(&new_xattr->rb_node, &xattrs->rb_root, rbtree_simple_xattr_less); write_unlock(&xattrs->lock); } /** * simple_xattrs_init - initialize new xattr header * @xattrs: header to initialize * * Initialize relevant fields of a an xattr header. */ void simple_xattrs_init(struct simple_xattrs *xattrs) { xattrs->rb_root = RB_ROOT; rwlock_init(&xattrs->lock); } /** * simple_xattrs_free - free xattrs * @xattrs: xattr header whose xattrs to destroy * @freed_space: approximate number of bytes of memory freed from @xattrs * * Destroy all xattrs in @xattr. When this is called no one can hold a * reference to any of the xattrs anymore. */ void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space) { struct rb_node *rbp; if (freed_space) *freed_space = 0; rbp = rb_first(&xattrs->rb_root); while (rbp) { struct simple_xattr *xattr; struct rb_node *rbp_next; rbp_next = rb_next(rbp); xattr = rb_entry(rbp, struct simple_xattr, rb_node); rb_erase(&xattr->rb_node, &xattrs->rb_root); if (freed_space) *freed_space += simple_xattr_space(xattr->name, xattr->size); simple_xattr_free(xattr); rbp = rbp_next; } } |
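/*
 * Editor's illustration (userspace, not part of fs/xattr.c): a small
 * program exercising the set/get/list syscalls implemented above through
 * the libc wrappers in <sys/xattr.h>. The file name "testfile" and the
 * "user.comment" attribute are arbitrary examples; the file must already
 * exist on an xattr-capable filesystem.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "testfile";
	char value[64];
	char list[256];
	ssize_t len;

	/* user.* attributes are only allowed on regular files and directories */
	if (setxattr(path, "user.comment", "hello", 5, XATTR_CREATE) &&
	    errno != EEXIST) {
		perror("setxattr");
		return 1;
	}

	len = getxattr(path, "user.comment", value, sizeof(value));
	if (len >= 0)
		printf("user.comment = %.*s\n", (int)len, value);

	/* names come back as a \0-separated list, as vfs_listxattr() builds it */
	len = listxattr(path, list, sizeof(list));
	for (ssize_t off = 0; off < len; off += strlen(list + off) + 1)
		printf("xattr: %s\n", list + off);

	return 0;
}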
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * The ChaCha stream cipher (RFC7539)
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/module.h>

#include <crypto/algapi.h> // for crypto_xor_cpy
#include <crypto/chacha.h>

void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src,
			  unsigned int bytes, int nrounds)
{
	/* aligned to potentially speed up crypto_xor() */
	u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));

	while (bytes >= CHACHA_BLOCK_SIZE) {
		chacha_block_generic(state, stream, nrounds);
		crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE);
		bytes -= CHACHA_BLOCK_SIZE;
		dst += CHACHA_BLOCK_SIZE;
		src += CHACHA_BLOCK_SIZE;
	}
	if (bytes) {
		chacha_block_generic(state, stream, nrounds);
		crypto_xor_cpy(dst, src, stream, bytes);
	}
}
EXPORT_SYMBOL(chacha_crypt_generic);

MODULE_DESCRIPTION("ChaCha stream cipher (RFC7539)");
MODULE_LICENSE("GPL");
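/*
 * Editor's illustration (not part of this file): driving
 * chacha_crypt_generic() with a hand-built RFC 7539 state: the four
 * "expand 32-byte k" constants, a 256-bit key, a 32-bit block counter and
 * a 96-bit nonce. In-tree users normally go through the state-setup
 * helpers in <crypto/chacha.h> instead; example_chacha20_crypt() is a
 * hypothetical name, and the key/nonce words are assumed to already be in
 * little-endian word order.
 */
#include <crypto/chacha.h>

static void example_chacha20_crypt(const u32 key[8], const u32 nonce[3],
				   const u8 *src, u8 *dst, unsigned int len)
{
	u32 state[16] = {
		/* "expand 32-byte k" */
		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
		key[0], key[1], key[2], key[3],
		key[4], key[5], key[6], key[7],
		0,				/* block counter starts at 0 */
		nonce[0], nonce[1], nonce[2],
	};

	chacha_crypt_generic(state, dst, src, len, 20);	/* 20 rounds = ChaCha20 */
}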
// SPDX-License-Identifier: GPL-2.0
/*
 * USB device quirk handling logic and table
 *
 * Copyright (c) 2007 Oliver Neukum
 * Copyright (c) 2007 Greg Kroah-Hartman <gregkh@suse.de>
 */

#include <linux/moduleparam.h>
#include <linux/usb.h>
#include <linux/usb/quirks.h>
#include <linux/usb/hcd.h>
#include "usb.h"

struct quirk_entry {
	u16 vid;
	u16 pid;
	u32 flags;
};

static DEFINE_MUTEX(quirk_mutex);

static struct quirk_entry *quirk_list;
static unsigned int quirk_count;

static char quirks_param[128];

static int quirks_param_set(const char *value,
const struct kernel_param *kp) { char *val, *p, *field; u16 vid, pid; u32 flags; size_t i; int err; val = kstrdup(value, GFP_KERNEL); if (!val) return -ENOMEM; err = param_set_copystring(val, kp); if (err) { kfree(val); return err; } mutex_lock(&quirk_mutex); if (!*val) { quirk_count = 0; kfree(quirk_list); quirk_list = NULL; goto unlock; } for (quirk_count = 1, i = 0; val[i]; i++) if (val[i] == ',') quirk_count++; if (quirk_list) { kfree(quirk_list); quirk_list = NULL; } quirk_list = kcalloc(quirk_count, sizeof(struct quirk_entry), GFP_KERNEL); if (!quirk_list) { quirk_count = 0; mutex_unlock(&quirk_mutex); kfree(val); return -ENOMEM; } for (i = 0, p = val; p && *p;) { /* Each entry consists of VID:PID:flags */ field = strsep(&p, ":"); if (!field) break; if (kstrtou16(field, 16, &vid)) break; field = strsep(&p, ":"); if (!field) break; if (kstrtou16(field, 16, &pid)) break; field = strsep(&p, ","); if (!field || !*field) break; /* Collect the flags */ for (flags = 0; *field; field++) { switch (*field) { case 'a': flags |= USB_QUIRK_STRING_FETCH_255; break; case 'b': flags |= USB_QUIRK_RESET_RESUME; break; case 'c': flags |= USB_QUIRK_NO_SET_INTF; break; case 'd': flags |= USB_QUIRK_CONFIG_INTF_STRINGS; break; case 'e': flags |= USB_QUIRK_RESET; break; case 'f': flags |= USB_QUIRK_HONOR_BNUMINTERFACES; break; case 'g': flags |= USB_QUIRK_DELAY_INIT; break; case 'h': flags |= USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL; break; case 'i': flags |= USB_QUIRK_DEVICE_QUALIFIER; break; case 'j': flags |= USB_QUIRK_IGNORE_REMOTE_WAKEUP; break; case 'k': flags |= USB_QUIRK_NO_LPM; break; case 'l': flags |= USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL; break; case 'm': flags |= USB_QUIRK_DISCONNECT_SUSPEND; break; case 'n': flags |= USB_QUIRK_DELAY_CTRL_MSG; break; case 'o': flags |= USB_QUIRK_HUB_SLOW_RESET; break; case 'p': flags |= USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT; break; /* Ignore unrecognized flag characters */ } } quirk_list[i++] = (struct quirk_entry) { .vid = vid, .pid = pid, .flags = flags }; } if (i < quirk_count) quirk_count = i; unlock: mutex_unlock(&quirk_mutex); kfree(val); return 0; } static const struct kernel_param_ops quirks_param_ops = { .set = quirks_param_set, .get = param_get_string, }; static struct kparam_string quirks_param_string = { .maxlen = sizeof(quirks_param), .string = quirks_param, }; device_param_cb(quirks, &quirks_param_ops, &quirks_param_string, 0644); MODULE_PARM_DESC(quirks, "Add/modify USB quirks by specifying quirks=vendorID:productID:quirks"); /* Lists of quirky USB devices, split in device quirks and interface quirks. * Device quirks are applied at the very beginning of the enumeration process, * right after reading the device descriptor. They can thus only match on device * information. * * Interface quirks are applied after reading all the configuration descriptors. * They can match on both device and interface information. * * Note that the DELAY_INIT and HONOR_BNUMINTERFACES quirks do not make sense as * interface quirks, as they only influence the enumeration process which is run * before processing the interface quirks. 
* * Please keep the lists ordered by: * 1) Vendor ID * 2) Product ID * 3) Class ID */ static const struct usb_device_id usb_quirk_list[] = { /* CBM - Flash disk */ { USB_DEVICE(0x0204, 0x6025), .driver_info = USB_QUIRK_RESET_RESUME }, /* WORLDE Controller KS49 or Prodipe MIDI 49C USB controller */ { USB_DEVICE(0x0218, 0x0201), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* WORLDE easy key (easykey.25) MIDI controller */ { USB_DEVICE(0x0218, 0x0401), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* HP 5300/5370C scanner */ { USB_DEVICE(0x03f0, 0x0701), .driver_info = USB_QUIRK_STRING_FETCH_255 }, /* HP v222w 16GB Mini USB Drive */ { USB_DEVICE(0x03f0, 0x3f40), .driver_info = USB_QUIRK_DELAY_INIT }, /* Creative SB Audigy 2 NX */ { USB_DEVICE(0x041e, 0x3020), .driver_info = USB_QUIRK_RESET_RESUME }, /* USB3503 */ { USB_DEVICE(0x0424, 0x3503), .driver_info = USB_QUIRK_RESET_RESUME }, /* Microsoft Wireless Laser Mouse 6000 Receiver */ { USB_DEVICE(0x045e, 0x00e1), .driver_info = USB_QUIRK_RESET_RESUME }, /* Microsoft LifeCam-VX700 v2.0 */ { USB_DEVICE(0x045e, 0x0770), .driver_info = USB_QUIRK_RESET_RESUME }, /* Microsoft Surface Dock Ethernet (RTL8153 GigE) */ { USB_DEVICE(0x045e, 0x07c6), .driver_info = USB_QUIRK_NO_LPM }, /* Cherry Stream G230 2.0 (G85-231) and 3.0 (G85-232) */ { USB_DEVICE(0x046a, 0x0023), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech HD Webcam C270 */ { USB_DEVICE(0x046d, 0x0825), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech HD Pro Webcams C920, C920-C, C922, C925e and C930e */ { USB_DEVICE(0x046d, 0x082d), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x0841), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x0843), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x085b), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x085c), .driver_info = USB_QUIRK_DELAY_INIT }, /* Logitech ConferenceCam CC3000e */ { USB_DEVICE(0x046d, 0x0847), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x0848), .driver_info = USB_QUIRK_DELAY_INIT }, /* Logitech PTZ Pro Camera */ { USB_DEVICE(0x046d, 0x0853), .driver_info = USB_QUIRK_DELAY_INIT }, /* Logitech Screen Share */ { USB_DEVICE(0x046d, 0x086c), .driver_info = USB_QUIRK_NO_LPM }, /* Logitech Quickcam Fusion */ { USB_DEVICE(0x046d, 0x08c1), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam Orbit MP */ { USB_DEVICE(0x046d, 0x08c2), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam Pro for Notebook */ { USB_DEVICE(0x046d, 0x08c3), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam Pro 5000 */ { USB_DEVICE(0x046d, 0x08c5), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam OEM Dell Notebook */ { USB_DEVICE(0x046d, 0x08c6), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Quickcam OEM Cisco VT Camera II */ { USB_DEVICE(0x046d, 0x08c7), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Harmony 700-series */ { USB_DEVICE(0x046d, 0xc122), .driver_info = USB_QUIRK_DELAY_INIT }, /* Philips PSC805 audio device */ { USB_DEVICE(0x0471, 0x0155), .driver_info = USB_QUIRK_RESET_RESUME }, /* Plantronic Audio 655 DSP */ { USB_DEVICE(0x047f, 0xc008), .driver_info = USB_QUIRK_RESET_RESUME }, /* Plantronic Audio 648 USB */ { USB_DEVICE(0x047f, 0xc013), .driver_info = USB_QUIRK_RESET_RESUME }, /* Artisman Watchdog Dongle */ { USB_DEVICE(0x04b4, 0x0526), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Microchip Joss Optical infrared touchboard device */ { USB_DEVICE(0x04d8, 0x000c), .driver_info = 
USB_QUIRK_CONFIG_INTF_STRINGS }, /* CarrolTouch 4000U */ { USB_DEVICE(0x04e7, 0x0009), .driver_info = USB_QUIRK_RESET_RESUME }, /* CarrolTouch 4500U */ { USB_DEVICE(0x04e7, 0x0030), .driver_info = USB_QUIRK_RESET_RESUME }, /* Samsung Android phone modem - ID conflict with SPH-I500 */ { USB_DEVICE(0x04e8, 0x6601), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Elan Touchscreen */ { USB_DEVICE(0x04f3, 0x0089), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x009b), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x010c), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x0125), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x016f), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, { USB_DEVICE(0x04f3, 0x0381), .driver_info = USB_QUIRK_NO_LPM }, { USB_DEVICE(0x04f3, 0x21b8), .driver_info = USB_QUIRK_DEVICE_QUALIFIER }, /* Roland SC-8820 */ { USB_DEVICE(0x0582, 0x0007), .driver_info = USB_QUIRK_RESET_RESUME }, /* Edirol SD-20 */ { USB_DEVICE(0x0582, 0x0027), .driver_info = USB_QUIRK_RESET_RESUME }, /* Alcor Micro Corp. Hub */ { USB_DEVICE(0x058f, 0x9254), .driver_info = USB_QUIRK_RESET_RESUME }, /* appletouch */ { USB_DEVICE(0x05ac, 0x021a), .driver_info = USB_QUIRK_RESET_RESUME }, /* Genesys Logic hub, internally used by KY-688 USB 3.1 Type-C Hub */ { USB_DEVICE(0x05e3, 0x0612), .driver_info = USB_QUIRK_NO_LPM }, /* ELSA MicroLink 56K */ { USB_DEVICE(0x05cc, 0x2267), .driver_info = USB_QUIRK_RESET_RESUME }, /* Genesys Logic hub, internally used by Moshi USB to Ethernet Adapter */ { USB_DEVICE(0x05e3, 0x0616), .driver_info = USB_QUIRK_NO_LPM }, /* Avision AV600U */ { USB_DEVICE(0x0638, 0x0a13), .driver_info = USB_QUIRK_STRING_FETCH_255 }, /* Saitek Cyborg Gold Joystick */ { USB_DEVICE(0x06a3, 0x0006), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Agfa SNAPSCAN 1212U */ { USB_DEVICE(0x06bd, 0x0001), .driver_info = USB_QUIRK_RESET_RESUME }, /* Guillemot Webcam Hercules Dualpix Exchange (2nd ID) */ { USB_DEVICE(0x06f8, 0x0804), .driver_info = USB_QUIRK_RESET_RESUME }, /* Guillemot Webcam Hercules Dualpix Exchange*/ { USB_DEVICE(0x06f8, 0x3005), .driver_info = USB_QUIRK_RESET_RESUME }, /* Guillemot Hercules DJ Console audio card (BZ 208357) */ { USB_DEVICE(0x06f8, 0xb000), .driver_info = USB_QUIRK_ENDPOINT_IGNORE }, /* Midiman M-Audio Keystation 88es */ { USB_DEVICE(0x0763, 0x0192), .driver_info = USB_QUIRK_RESET_RESUME }, /* SanDisk Ultra Fit and Ultra Flair */ { USB_DEVICE(0x0781, 0x5583), .driver_info = USB_QUIRK_NO_LPM }, { USB_DEVICE(0x0781, 0x5591), .driver_info = USB_QUIRK_NO_LPM }, /* Realforce 87U Keyboard */ { USB_DEVICE(0x0853, 0x011b), .driver_info = USB_QUIRK_NO_LPM }, /* M-Systems Flash Disk Pioneers */ { USB_DEVICE(0x08ec, 0x1000), .driver_info = USB_QUIRK_RESET_RESUME }, /* Baum Vario Ultra */ { USB_DEVICE(0x0904, 0x6101), .driver_info = USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL }, { USB_DEVICE(0x0904, 0x6102), .driver_info = USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL }, { USB_DEVICE(0x0904, 0x6103), .driver_info = USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL }, /* Sound Devices USBPre2 */ { USB_DEVICE(0x0926, 0x0202), .driver_info = USB_QUIRK_ENDPOINT_IGNORE }, /* Sound Devices MixPre-D */ { USB_DEVICE(0x0926, 0x0208), .driver_info = USB_QUIRK_ENDPOINT_IGNORE }, /* Keytouch QWERTY Panel keyboard */ { USB_DEVICE(0x0926, 0x3333), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Kingston DataTraveler 3.0 */ { USB_DEVICE(0x0951, 0x1666), .driver_info = USB_QUIRK_NO_LPM }, /* TOSHIBA TransMemory-Mx */ { USB_DEVICE(0x0930, 
0x1408), .driver_info = USB_QUIRK_NO_LPM }, /* NVIDIA Jetson devices in Force Recovery mode */ { USB_DEVICE(0x0955, 0x7018), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7019), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7418), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7721), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7c18), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7e19), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x0955, 0x7f21), .driver_info = USB_QUIRK_RESET_RESUME }, /* X-Rite/Gretag-Macbeth Eye-One Pro display colorimeter */ { USB_DEVICE(0x0971, 0x2000), .driver_info = USB_QUIRK_NO_SET_INTF }, /* ELMO L-12F document camera */ { USB_DEVICE(0x09a1, 0x0028), .driver_info = USB_QUIRK_DELAY_CTRL_MSG }, /* Broadcom BCM92035DGROM BT dongle */ { USB_DEVICE(0x0a5c, 0x2021), .driver_info = USB_QUIRK_RESET_RESUME }, /* MAYA44USB sound device */ { USB_DEVICE(0x0a92, 0x0091), .driver_info = USB_QUIRK_RESET_RESUME }, /* ASUS Base Station(T100) */ { USB_DEVICE(0x0b05, 0x17e0), .driver_info = USB_QUIRK_IGNORE_REMOTE_WAKEUP }, /* Realtek Semiconductor Corp. Mass Storage Device (Multicard Reader)*/ { USB_DEVICE(0x0bda, 0x0151), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Realtek hub in Dell WD19 (Type-C) */ { USB_DEVICE(0x0bda, 0x0487), .driver_info = USB_QUIRK_NO_LPM }, /* Generic RTL8153 based ethernet adapters */ { USB_DEVICE(0x0bda, 0x8153), .driver_info = USB_QUIRK_NO_LPM }, /* SONiX USB DEVICE Touchpad */ { USB_DEVICE(0x0c45, 0x7056), .driver_info = USB_QUIRK_IGNORE_REMOTE_WAKEUP }, /* Sony Xperia XZ1 Compact (lilac) smartphone in fastboot mode */ { USB_DEVICE(0x0fce, 0x0dde), .driver_info = USB_QUIRK_NO_LPM }, /* Action Semiconductor flash disk */ { USB_DEVICE(0x10d6, 0x2200), .driver_info = USB_QUIRK_STRING_FETCH_255 }, /* novation SoundControl XL */ { USB_DEVICE(0x1235, 0x0061), .driver_info = USB_QUIRK_RESET_RESUME }, /* Focusrite Scarlett Solo USB */ { USB_DEVICE(0x1235, 0x8211), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, /* Huawei 4G LTE module */ { USB_DEVICE(0x12d1, 0x15bb), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, { USB_DEVICE(0x12d1, 0x15c3), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, /* SKYMEDI USB_DRIVE */ { USB_DEVICE(0x1516, 0x8628), .driver_info = USB_QUIRK_RESET_RESUME }, /* Razer - Razer Blade Keyboard */ { USB_DEVICE(0x1532, 0x0116), .driver_info = USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL }, /* Lenovo ThinkPad OneLink+ Dock twin hub controllers (VIA Labs VL812) */ { USB_DEVICE(0x17ef, 0x1018), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x17ef, 0x1019), .driver_info = USB_QUIRK_RESET_RESUME }, /* Lenovo USB-C to Ethernet Adapter RTL8153-04 */ { USB_DEVICE(0x17ef, 0x720c), .driver_info = USB_QUIRK_NO_LPM }, /* Lenovo Powered USB-C Travel Hub (4X90S92381, RTL8153 GigE) */ { USB_DEVICE(0x17ef, 0x721e), .driver_info = USB_QUIRK_NO_LPM }, /* Lenovo ThinkCenter A630Z TI024Gen3 usb-audio */ { USB_DEVICE(0x17ef, 0xa012), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, /* Lenovo ThinkPad USB-C Dock Gen2 Ethernet (RTL8153 GigE) */ { USB_DEVICE(0x17ef, 0xa387), .driver_info = USB_QUIRK_NO_LPM }, /* BUILDWIN Photo Frame */ { USB_DEVICE(0x1908, 0x1315), .driver_info = USB_QUIRK_HONOR_BNUMINTERFACES }, /* Protocol and OTG Electrical Test Device */ { USB_DEVICE(0x1a0a, 0x0200), .driver_info = USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL }, /* Terminus Technology Inc. 
Hub */ { USB_DEVICE(0x1a40, 0x0101), .driver_info = USB_QUIRK_HUB_SLOW_RESET }, /* Corsair K70 RGB */ { USB_DEVICE(0x1b1c, 0x1b13), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_DELAY_CTRL_MSG }, /* Corsair Strafe */ { USB_DEVICE(0x1b1c, 0x1b15), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_DELAY_CTRL_MSG }, /* Corsair Strafe RGB */ { USB_DEVICE(0x1b1c, 0x1b20), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_DELAY_CTRL_MSG }, /* Corsair K70 LUX RGB */ { USB_DEVICE(0x1b1c, 0x1b33), .driver_info = USB_QUIRK_DELAY_INIT }, /* Corsair K70 LUX */ { USB_DEVICE(0x1b1c, 0x1b36), .driver_info = USB_QUIRK_DELAY_INIT }, /* Corsair K70 RGB RAPDIFIRE */ { USB_DEVICE(0x1b1c, 0x1b38), .driver_info = USB_QUIRK_DELAY_INIT | USB_QUIRK_DELAY_CTRL_MSG }, /* START BP-850k Printer */ { USB_DEVICE(0x1bc3, 0x0003), .driver_info = USB_QUIRK_NO_SET_INTF }, /* MIDI keyboard WORLDE MINI */ { USB_DEVICE(0x1c75, 0x0204), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Acer C120 LED Projector */ { USB_DEVICE(0x1de1, 0xc102), .driver_info = USB_QUIRK_NO_LPM }, /* Blackmagic Design Intensity Shuttle */ { USB_DEVICE(0x1edb, 0xbd3b), .driver_info = USB_QUIRK_NO_LPM }, /* Blackmagic Design UltraStudio SDI */ { USB_DEVICE(0x1edb, 0xbd4f), .driver_info = USB_QUIRK_NO_LPM }, /* Teclast disk */ { USB_DEVICE(0x1f75, 0x0917), .driver_info = USB_QUIRK_NO_LPM }, /* Hauppauge HVR-950q */ { USB_DEVICE(0x2040, 0x7200), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* Raydium Touchscreen */ { USB_DEVICE(0x2386, 0x3114), .driver_info = USB_QUIRK_NO_LPM }, { USB_DEVICE(0x2386, 0x3119), .driver_info = USB_QUIRK_NO_LPM }, { USB_DEVICE(0x2386, 0x350e), .driver_info = USB_QUIRK_NO_LPM }, /* APTIV AUTOMOTIVE HUB */ { USB_DEVICE(0x2c48, 0x0132), .driver_info = USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT }, /* DJI CineSSD */ { USB_DEVICE(0x2ca3, 0x0031), .driver_info = USB_QUIRK_NO_LPM }, /* Alcor Link AK9563 SC Reader used in 2022 Lenovo ThinkPads */ { USB_DEVICE(0x2ce3, 0x9563), .driver_info = USB_QUIRK_NO_LPM }, /* DELL USB GEN2 */ { USB_DEVICE(0x413c, 0xb062), .driver_info = USB_QUIRK_NO_LPM | USB_QUIRK_RESET_RESUME }, /* VCOM device */ { USB_DEVICE(0x4296, 0x7570), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, /* INTEL VALUE SSD */ { USB_DEVICE(0x8086, 0xf1a5), .driver_info = USB_QUIRK_RESET_RESUME }, { } /* terminating entry must be last */ }; static const struct usb_device_id usb_interface_quirk_list[] = { /* Logitech UVC Cameras */ { USB_VENDOR_AND_INTERFACE_INFO(0x046d, USB_CLASS_VIDEO, 1, 0), .driver_info = USB_QUIRK_RESET_RESUME }, { } /* terminating entry must be last */ }; static const struct usb_device_id usb_amd_resume_quirk_list[] = { /* Lenovo Mouse with Pixart controller */ { USB_DEVICE(0x17ef, 0x602e), .driver_info = USB_QUIRK_RESET_RESUME }, /* Pixart Mouse */ { USB_DEVICE(0x093a, 0x2500), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x093a, 0x2510), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x093a, 0x2521), .driver_info = USB_QUIRK_RESET_RESUME }, { USB_DEVICE(0x03f0, 0x2b4a), .driver_info = USB_QUIRK_RESET_RESUME }, /* Logitech Optical Mouse M90/M100 */ { USB_DEVICE(0x046d, 0xc05a), .driver_info = USB_QUIRK_RESET_RESUME }, { } /* terminating entry must be last */ }; /* * Entries for endpoints that should be ignored when parsing configuration * descriptors. * * Matched for devices with USB_QUIRK_ENDPOINT_IGNORE. 
*/ static const struct usb_device_id usb_endpoint_ignore[] = { { USB_DEVICE_INTERFACE_NUMBER(0x06f8, 0xb000, 5), .driver_info = 0x01 }, { USB_DEVICE_INTERFACE_NUMBER(0x06f8, 0xb000, 5), .driver_info = 0x81 }, { USB_DEVICE_INTERFACE_NUMBER(0x0926, 0x0202, 1), .driver_info = 0x85 }, { USB_DEVICE_INTERFACE_NUMBER(0x0926, 0x0208, 1), .driver_info = 0x85 }, { } }; bool usb_endpoint_is_ignored(struct usb_device *udev, struct usb_host_interface *intf, struct usb_endpoint_descriptor *epd) { const struct usb_device_id *id; unsigned int address; for (id = usb_endpoint_ignore; id->match_flags; ++id) { if (!usb_match_device(udev, id)) continue; if (!usb_match_one_id_intf(udev, intf, id)) continue; address = id->driver_info; if (address == epd->bEndpointAddress) return true; } return false; } static bool usb_match_any_interface(struct usb_device *udev, const struct usb_device_id *id) { unsigned int i; for (i = 0; i < udev->descriptor.bNumConfigurations; ++i) { struct usb_host_config *cfg = &udev->config[i]; unsigned int j; for (j = 0; j < cfg->desc.bNumInterfaces; ++j) { struct usb_interface_cache *cache; struct usb_host_interface *intf; cache = cfg->intf_cache[j]; if (cache->num_altsetting == 0) continue; intf = &cache->altsetting[0]; if (usb_match_one_id_intf(udev, intf, id)) return true; } } return false; } static int usb_amd_resume_quirk(struct usb_device *udev) { struct usb_hcd *hcd; hcd = bus_to_hcd(udev->bus); /* The device should be attached directly to root hub */ if (udev->level == 1 && hcd->amd_resume_bug == 1) return 1; return 0; } static u32 usb_detect_static_quirks(struct usb_device *udev, const struct usb_device_id *id) { u32 quirks = 0; for (; id->match_flags; id++) { if (!usb_match_device(udev, id)) continue; if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_INFO) && !usb_match_any_interface(udev, id)) continue; quirks |= (u32)(id->driver_info); } return quirks; } static u32 usb_detect_dynamic_quirks(struct usb_device *udev) { u16 vid = le16_to_cpu(udev->descriptor.idVendor); u16 pid = le16_to_cpu(udev->descriptor.idProduct); int i, flags = 0; mutex_lock(&quirk_mutex); for (i = 0; i < quirk_count; i++) { if (vid == quirk_list[i].vid && pid == quirk_list[i].pid) { flags = quirk_list[i].flags; break; } } mutex_unlock(&quirk_mutex); return flags; } /* * Detect any quirks the device has, and do any housekeeping for it if needed. */ void usb_detect_quirks(struct usb_device *udev) { udev->quirks = usb_detect_static_quirks(udev, usb_quirk_list); /* * Pixart-based mice would trigger remote wakeup issue on AMD * Yangtze chipset, so set them as RESET_RESUME flag. */ if (usb_amd_resume_quirk(udev)) udev->quirks |= usb_detect_static_quirks(udev, usb_amd_resume_quirk_list); udev->quirks ^= usb_detect_dynamic_quirks(udev); if (udev->quirks) dev_dbg(&udev->dev, "USB quirks for this device: %x\n", udev->quirks); #ifdef CONFIG_USB_DEFAULT_PERSIST if (!(udev->quirks & USB_QUIRK_RESET)) udev->persist_enabled = 1; #else /* Hubs are automatically enabled for USB-PERSIST */ if (udev->descriptor.bDeviceClass == USB_CLASS_HUB) udev->persist_enabled = 1; #endif /* CONFIG_USB_DEFAULT_PERSIST */ } void usb_detect_interface_quirks(struct usb_device *udev) { u32 quirks; quirks = usb_detect_static_quirks(udev, usb_interface_quirk_list); if (quirks == 0) return; dev_dbg(&udev->dev, "USB interface quirks for this device: %x\n", quirks); udev->quirks |= quirks; } void usb_release_quirk_list(void) { mutex_lock(&quirk_mutex); kfree(quirk_list); quirk_list = NULL; mutex_unlock(&quirk_mutex); } |
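/*
 * A small sketch (editor's illustration, not part of the original file): the
 * inverse of the per-character parsing in quirks_param_set() above, turning a
 * quirk bitmask back into the letters accepted by the "quirks" module
 * parameter (e.g. "usbcore.quirks=0781:5583:k" as a hypothetical value).
 * Only a representative subset of the flag letters is shown.
 */
static __maybe_unused void quirk_flags_to_string_example(u32 flags,
							  char *buf, size_t len)
{
	static const struct {
		char c;
		u32 flag;
	} map[] = {
		{ 'a', USB_QUIRK_STRING_FETCH_255 },
		{ 'b', USB_QUIRK_RESET_RESUME },
		{ 'c', USB_QUIRK_NO_SET_INTF },
		{ 'd', USB_QUIRK_CONFIG_INTF_STRINGS },
		{ 'g', USB_QUIRK_DELAY_INIT },
		{ 'k', USB_QUIRK_NO_LPM },
		{ 'n', USB_QUIRK_DELAY_CTRL_MSG },
	};
	size_t i, n = 0;

	if (!len)
		return;
	for (i = 0; i < ARRAY_SIZE(map) && n + 1 < len; i++)
		if (flags & map[i].flag)
			buf[n++] = map[i].c;
	buf[n] = '\0';	/* always NUL-terminated, possibly truncated */
}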
// SPDX-License-Identifier: GPL-2.0
/*
 * mm/pgtable-generic.c
 *
 * Generic pgtable methods declared in linux/pgtable.h
 *
 * Copyright (C) 2010 Linus Torvalds
 */

#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mm_inline.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

/*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none. Usually (but
 * very seldom) called out from the p?d_none_or_clear_bad macros.
 */

void pgd_clear_bad(pgd_t *pgd)
{
	pgd_ERROR(*pgd);
	pgd_clear(pgd);
}

#ifndef __PAGETABLE_P4D_FOLDED
void p4d_clear_bad(p4d_t *p4d)
{
	p4d_ERROR(*p4d);
	p4d_clear(p4d);
}
#endif

#ifndef __PAGETABLE_PUD_FOLDED
void pud_clear_bad(pud_t *pud)
{
	pud_ERROR(*pud);
	pud_clear(pud);
}
#endif

/*
 * Note that the pmd variant below can't be stub'ed out just as for p4d/pud
 * above. pmd folding is special and typically pmd_* macros refer to upper
 * level even when folded
 */
void pmd_clear_bad(pmd_t *pmd)
{
	pmd_ERROR(*pmd);
	pmd_clear(pmd);
}

#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
 * Only sets the access flags (dirty, accessed), as well as write
 * permission. Furthermore, we know it always gets set to a "more
 * permissive" setting, which allows most architectures to optimize
 * this. We return whether the PTE actually changed, which in turn
 * instructs the caller to do things like update_mmu_cache().
This * used to be done in the caller, but sparc needs minor faults to * force that call on sun4c so we changed this macro slightly */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { int changed = !pte_same(ptep_get(ptep), entry); if (changed) { set_pte_at(vma->vm_mm, address, ptep, entry); flush_tlb_fix_spurious_fault(vma, address, ptep); } return changed; } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { int young; young = ptep_test_and_clear_young(vma, address, ptep); if (young) flush_tlb_page(vma, address); return young; } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { struct mm_struct *mm = (vma)->vm_mm; pte_t pte; pte = ptep_get_and_clear(mm, address, ptep); if (pte_accessible(mm, pte)) flush_tlb_page(vma, address); return pte; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { int changed = !pmd_same(*pmdp, entry); VM_BUG_ON(address & ~HPAGE_PMD_MASK); if (changed) { set_pmd_at(vma->vm_mm, address, pmdp, entry); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } return changed; } #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { int young; VM_BUG_ON(address & ~HPAGE_PMD_MASK); young = pmdp_test_and_clear_young(vma, address, pmdp); if (young) flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return young; } #endif #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t *pudp) { pud_t pud; VM_BUG_ON(address & ~HPAGE_PUD_MASK); VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp)); pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp); flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); return pud; } #endif #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(&pgtable->lru); else list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); pmd_huge_pte(mm, pmdp) = pgtable; } #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW /* no "address" argument so destroys page coloring of some arch */ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ pgtable = pmd_huge_pte(mm, pmdp); pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru, struct page, lru); if (pmd_huge_pte(mm, pmdp)) list_del(&pgtable->lru); return pgtable; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { VM_WARN_ON_ONCE(!pmd_present(*pmdp)); pmd_t old = pmdp_establish(vma, address, pmdp, 
pmd_mkinvalid(*pmdp)); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return old; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { VM_WARN_ON_ONCE(!pmd_present(*pmdp)); return pmdp_invalidate(vma, address, pmdp); } #endif #ifndef pmdp_collapse_flush pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { /* * pmd and hugepage pte format are same. So we could * use the same function. */ pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(pmd_trans_huge(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); /* collapse entails shooting down ptes not pmd */ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } #endif /* arch define pte_free_defer in asm/pgalloc.h for its own implementation */ #ifndef pte_free_defer static void pte_free_now(struct rcu_head *head) { struct page *page; page = container_of(head, struct page, rcu_head); pte_free(NULL /* mm not passed and not used */, (pgtable_t)page); } void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) { struct page *page; page = pgtable; call_rcu(&page->rcu_head, pte_free_now); } #endif /* pte_free_defer */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \ (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU)) /* * See the comment above ptep_get_lockless() in include/linux/pgtable.h: * the barriers in pmdp_get_lockless() cannot guarantee that the value in * pmd_high actually belongs with the value in pmd_low; but holding interrupts * off blocks the TLB flush between present updates, which guarantees that a * successful __pte_offset_map() points to a page from matched halves. */ static unsigned long pmdp_get_lockless_start(void) { unsigned long irqflags; local_irq_save(irqflags); return irqflags; } static void pmdp_get_lockless_end(unsigned long irqflags) { local_irq_restore(irqflags); } #else static unsigned long pmdp_get_lockless_start(void) { return 0; } static void pmdp_get_lockless_end(unsigned long irqflags) { } #endif pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { unsigned long irqflags; pmd_t pmdval; rcu_read_lock(); irqflags = pmdp_get_lockless_start(); pmdval = pmdp_get_lockless(pmd); pmdp_get_lockless_end(irqflags); if (pmdvalp) *pmdvalp = pmdval; if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) goto nomap; if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval))) goto nomap; if (unlikely(pmd_bad(pmdval))) { pmd_clear_bad(pmd); goto nomap; } return __pte_map(&pmdval, addr); nomap: rcu_read_unlock(); return NULL; } pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { pmd_t pmdval; pte_t *pte; pte = __pte_offset_map(pmd, addr, &pmdval); if (likely(pte)) *ptlp = pte_lockptr(mm, &pmdval); return pte; } pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp, spinlock_t **ptlp) { pte_t *pte; VM_WARN_ON_ONCE(!pmdvalp); pte = __pte_offset_map(pmd, addr, pmdvalp); if (likely(pte)) *ptlp = pte_lockptr(mm, pmdvalp); return pte; } /* * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation * __pte_offset_map_lock() below, is usually called with the pmd pointer for * addr, reached by walking down the mm's pgd, p4d, pud for addr: either while * holding mmap_lock or vma lock for read or for write; or in truncate or rmap * context, while holding file's i_mmap_lock or 
anon_vma lock for read (or for write). In a few cases, it may be used with
 * pmd pointing to a pmd_t already copied to or constructed on the stack.
 *
 * When successful, it returns the pte pointer for addr, with its page table
 * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent
 * modification by software, with a pointer to that spinlock in ptlp (in some
 * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in table's
 * struct page). pte_unmap_unlock(pte, ptl) to unlock and unmap afterwards.
 *
 * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no
 * page table at *pmd: if, for example, the page table has just been removed,
 * or replaced by the huge pmd of a THP. (When successful, *pmd is rechecked
 * after acquiring the ptlock, and retried internally if it changed: so that a
 * page table can be safely removed or replaced by THP while holding its lock.)
 *
 * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above,
 * just returns the pte pointer for addr, its page table kmapped if necessary;
 * or NULL if there is no page table at *pmd. It does not attempt to lock the
 * page table, so cannot normally be used when the page table is to be updated,
 * or when entries read must be stable. But it does take rcu_read_lock(): so
 * that even when the page table is racily removed, it remains a valid though
 * empty and disconnected table, until pte_unmap(pte) unmaps it and
 * rcu_read_unlock()s afterwards.
 *
 * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like
 * pte_offset_map(); but when successful, it also outputs a pointer to the
 * spinlock in ptlp - as pte_offset_map_lock() does, but in this case without
 * locking it. This helps the caller to avoid a later pte_lockptr(mm, *pmd),
 * which might by that time act on a changed *pmd: pte_offset_map_ro_nolock()
 * provides the correct spinlock pointer for the page table that it returns.
 * Even after grabbing the spinlock, we might be looking either at a page
 * table that is still mapped or one that was unmapped and is about to get
 * freed. But for R/O access this is sufficient. So it is only applicable to
 * read-only cases, where modification of the page table is not allowed even
 * if the corresponding spinlock is held afterwards.
 *
 * pte_offset_map_rw_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like
 * pte_offset_map_ro_nolock(); but when successful, it also outputs the
 * pmdval. It is applicable to may-write cases, where the page table may be
 * modified after the corresponding spinlock is taken. Callers must then make
 * sure the page table is still stable, for example by checking pte_same() or
 * by checking pmd_same() against the output pmdval, before performing any
 * write operations.
 *
 * Note: "RO" / "RW" expresses the intended semantics, not that the *kmap*
 * will be read-only/read-write protected.
 *
 * Note that free_pgtables(), used after unmapping detached vmas, or when
 * exiting the whole mm, does not take page table lock before freeing a page
 * table, and may not use RCU at all: "outsiders" like khugepaged should avoid
 * pte_offset_map() and co once the vma is detached from mm or mm_users is
 * zero.
 */
pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
			     unsigned long addr, spinlock_t **ptlp)
{
	spinlock_t *ptl;
	pmd_t pmdval;
	pte_t *pte;
again:
	pte = __pte_offset_map(pmd, addr, &pmdval);
	if (unlikely(!pte))
		return pte;
	ptl = pte_lockptr(mm, &pmdval);
	spin_lock(ptl);
	if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
		*ptlp = ptl;
		return pte;
	}
	pte_unmap_unlock(pte, ptl);
	goto again;
}
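/*
 * A minimal calling sketch (editor's illustration, not part of the original
 * file): the usual pte_offset_map_lock() pattern described in the comment
 * above, assuming the caller already holds mmap_lock (or an equivalent) and
 * has walked pgd/p4d/pud down to the pmd for @addr. The helper name is
 * hypothetical.
 */
static __maybe_unused bool example_pte_is_present(struct mm_struct *mm,
						  pmd_t *pmd, unsigned long addr)
{
	spinlock_t *ptl;
	pte_t *pte;
	bool present = false;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (pte) {
		/* the entry is stable while ptl is held */
		present = pte_present(ptep_get(pte));
		/* unlocks ptl and kunmaps the table if CONFIG_HIGHPTE */
		pte_unmap_unlock(pte, ptl);
	}
	return present;
}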
// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/mm/swapfile.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 * Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
#include <linux/blk-cgroup.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>
#include <linux/completion.h>
#include <linux/suspend.h>
#include <linux/zswap.h>
#include <linux/plist.h>

#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
#include "internal.h"
#include "swap.h"

static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static void swap_entry_range_free(struct swap_info_struct *si,
				  struct swap_cluster_info *ci,
				  swp_entry_t entry, unsigned int nr_pages);
static void swap_range_alloc(struct swap_info_struct *si,
			     unsigned int nr_entries);
static bool folio_swapcache_freeable(struct folio *folio);
static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
					      unsigned long offset);
static inline void unlock_cluster(struct swap_cluster_info *ci);

static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
/*
 * Some modules use swappable objects and may try to swap them out under
 * memory pressure (via the shrinker). Before doing so, they may wish to
 * check to see if any swap space is available.
*/ EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority = -1; unsigned long swapfile_maximum_size; #ifdef CONFIG_MIGRATION bool swap_migration_ad_supported; #endif /* CONFIG_MIGRATION */ static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; /* * all active swap_info_structs * protected with swap_lock, and ordered by priority. */ static PLIST_HEAD(swap_active_head); /* * all available (active, not full) swap_info_structs * protected with swap_avail_lock, ordered by priority. * This is used by folio_alloc_swap() instead of swap_active_head * because swap_active_head includes all swap_info_structs, * but folio_alloc_swap() doesn't need to look at full ones. * This uses its own lock instead of swap_lock because when a * swap_info_struct changes between not-full/full, it needs to * add/remove itself to/from this list, but the swap_info_struct->lock * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); static struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); /* Activity counter to indicate that a swapon or swapoff has occurred */ static atomic_t proc_poll_event = ATOMIC_INIT(0); atomic_t nr_rotate_swap = ATOMIC_INIT(0); static struct swap_info_struct *swap_type_to_swap_info(int type) { if (type >= MAX_SWAPFILES) return NULL; return READ_ONCE(swap_info[type]); /* rcu_dereference() */ } static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ } /* * Use the second highest bit of inuse_pages counter as the indicator * if one swap device is on the available plist, so the atomic can * still be updated arithmetically while having special data embedded. * * inuse_pages counter is the only thing indicating if a device should * be on avail_lists or not (except swapon / swapoff). By embedding the * off-list bit in the atomic counter, updates no longer need any lock * to check the list status. * * This bit will be set if the device is not on the plist and not * usable, will be cleared if the device is on the plist. 
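 *
 * For example (assuming a 32-bit atomic_t, so the off-list bit is bit 30):
 * a device with 1000 slots in use reads as 1000 while it is on the plist,
 * and as (1 << 30) + 1000 once it has been taken off the list;
 * swap_usage_in_pages() below masks the bit out with SWAP_USAGE_COUNTER_MASK
 * so callers always see the plain page count.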
*/ #define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2)) #define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT) static long swap_usage_in_pages(struct swap_info_struct *si) { return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK; } /* Reclaim the swap entry anyway if possible */ #define TTRS_ANYWAY 0x1 /* * Reclaim the swap entry if there are no more mappings of the * corresponding page */ #define TTRS_UNMAPPED 0x2 /* Reclaim the swap entry if swap is getting full */ #define TTRS_FULL 0x4 /* Reclaim directly, bypass the slot cache and don't touch device lock */ #define TTRS_DIRECT 0x8 static bool swap_is_has_cache(struct swap_info_struct *si, unsigned long offset, int nr_pages) { unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; do { VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); if (*map != SWAP_HAS_CACHE) return false; } while (++map < map_end); return true; } static bool swap_is_last_map(struct swap_info_struct *si, unsigned long offset, int nr_pages, bool *has_cache) { unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; unsigned char count = *map; if (swap_count(count) != 1) return false; while (++map < map_end) { if (*map != count) return false; } *has_cache = !!(count & SWAP_HAS_CACHE); return true; } /* * returns number of pages in the folio that backs the swap entry. If positive, * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no * folio was associated with the swap entry. */ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); struct address_space *address_space = swap_address_space(entry); struct swap_cluster_info *ci; struct folio *folio; int ret, nr_pages; bool need_reclaim; folio = filemap_get_folio(address_space, swap_cache_index(entry)); if (IS_ERR(folio)) return 0; nr_pages = folio_nr_pages(folio); ret = -nr_pages; /* * When this function is called from scan_swap_map_slots() and it's * called by vmscan.c at reclaiming folios. So we hold a folio lock * here. We have to use trylock for avoiding deadlock. This is a special * case and you should use folio_free_swap() with explicit folio_lock() * in usual operations. */ if (!folio_trylock(folio)) goto out; /* offset could point to the middle of a large folio */ entry = folio->swap; offset = swp_offset(entry); need_reclaim = ((flags & TTRS_ANYWAY) || ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); if (!need_reclaim || !folio_swapcache_freeable(folio)) goto out_unlock; /* * It's safe to delete the folio from swap cache only if the folio's * swap_map is HAS_CACHE only, which means the slots have no page table * reference or pending writeback, and can't be allocated to others. 
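 * (swap_is_has_cache() above implements exactly this check; it is called
 * under the cluster lock taken right below.)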
*/ ci = lock_cluster(si, offset); need_reclaim = swap_is_has_cache(si, offset, nr_pages); unlock_cluster(ci); if (!need_reclaim) goto out_unlock; if (!(flags & TTRS_DIRECT)) { /* Free through slot cache */ delete_from_swap_cache(folio); folio_set_dirty(folio); ret = nr_pages; goto out_unlock; } xa_lock_irq(&address_space->i_pages); __delete_from_swap_cache(folio, entry, NULL); xa_unlock_irq(&address_space->i_pages); folio_ref_sub(folio, nr_pages); folio_set_dirty(folio); ci = lock_cluster(si, offset); swap_entry_range_free(si, ci, entry, nr_pages); unlock_cluster(ci); ret = nr_pages; out_unlock: folio_unlock(folio); out: folio_put(folio); return ret; } static inline struct swap_extent *first_se(struct swap_info_struct *sis) { struct rb_node *rb = rb_first(&sis->swap_extent_root); return rb_entry(rb, struct swap_extent, rb_node); } static inline struct swap_extent *next_se(struct swap_extent *se) { struct rb_node *rb = rb_next(&se->rb_node); return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL; } /* * swapon tell device that all the old swap contents can be discarded, * to allow the swap device to optimize its wear-levelling. */ static int discard_swap(struct swap_info_struct *si) { struct swap_extent *se; sector_t start_block; sector_t nr_blocks; int err = 0; /* Do not discard the swap header page! */ se = first_se(si); start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) { err = blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_KERNEL); if (err) return err; cond_resched(); } for (se = next_se(se); se; se = next_se(se)) { start_block = se->start_block << (PAGE_SHIFT - 9); nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_KERNEL); if (err) break; cond_resched(); } return err; /* That will often be -EOPNOTSUPP */ } static struct swap_extent * offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) { struct swap_extent *se; struct rb_node *rb; rb = sis->swap_extent_root.rb_node; while (rb) { se = rb_entry(rb, struct swap_extent, rb_node); if (offset < se->start_page) rb = rb->rb_left; else if (offset >= se->start_page + se->nr_pages) rb = rb->rb_right; else return se; } /* It *must* be present */ BUG(); } sector_t swap_folio_sector(struct folio *folio) { struct swap_info_struct *sis = swp_swap_info(folio->swap); struct swap_extent *se; sector_t sector; pgoff_t offset; offset = swp_offset(folio->swap); se = offset_to_swap_extent(sis, offset); sector = se->start_block + (offset - se->start_page); return sector << (PAGE_SHIFT - 9); } /* * swap allocation tell device that a cluster of swap can now be discarded, * to allow the swap device to optimize its wear-levelling. 
*/ static void discard_swap_cluster(struct swap_info_struct *si, pgoff_t start_page, pgoff_t nr_pages) { struct swap_extent *se = offset_to_swap_extent(si, start_page); while (nr_pages) { pgoff_t offset = start_page - se->start_page; sector_t start_block = se->start_block + offset; sector_t nr_blocks = se->nr_pages - offset; if (nr_blocks > nr_pages) nr_blocks = nr_pages; start_page += nr_blocks; nr_pages -= nr_blocks; start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, nr_blocks, GFP_NOIO)) break; se = next_se(se); } } #ifdef CONFIG_THP_SWAP #define SWAPFILE_CLUSTER HPAGE_PMD_NR #define swap_entry_order(order) (order) #else #define SWAPFILE_CLUSTER 256 /* * Define swap_entry_order() as constant to let compiler to optimize * out some code if !CONFIG_THP_SWAP */ #define swap_entry_order(order) 0 #endif #define LATENCY_LIMIT 256 static inline bool cluster_is_empty(struct swap_cluster_info *info) { return info->count == 0; } static inline bool cluster_is_discard(struct swap_cluster_info *info) { return info->flags == CLUSTER_FLAG_DISCARD; } static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order) { if (unlikely(ci->flags > CLUSTER_FLAG_USABLE)) return false; if (!order) return true; return cluster_is_empty(ci) || order == ci->order; } static inline unsigned int cluster_index(struct swap_info_struct *si, struct swap_cluster_info *ci) { return ci - si->cluster_info; } static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si, unsigned long offset) { return &si->cluster_info[offset / SWAPFILE_CLUSTER]; } static inline unsigned int cluster_offset(struct swap_info_struct *si, struct swap_cluster_info *ci) { return cluster_index(si, ci) * SWAPFILE_CLUSTER; } static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, unsigned long offset) { struct swap_cluster_info *ci; ci = offset_to_cluster(si, offset); spin_lock(&ci->lock); return ci; } static inline void unlock_cluster(struct swap_cluster_info *ci) { spin_unlock(&ci->lock); } static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, enum swap_cluster_flags new_flags) { VM_WARN_ON(ci->flags == new_flags); BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX); lockdep_assert_held(&ci->lock); spin_lock(&si->lock); if (ci->flags == CLUSTER_FLAG_NONE) list_add_tail(&ci->list, list); else list_move_tail(&ci->list, list); spin_unlock(&si->lock); if (ci->flags == CLUSTER_FLAG_FRAG) atomic_long_dec(&si->frag_cluster_nr[ci->order]); else if (new_flags == CLUSTER_FLAG_FRAG) atomic_long_inc(&si->frag_cluster_nr[ci->order]); ci->flags = new_flags; } /* Add a cluster to discard list and schedule it to do discard */ static void swap_cluster_schedule_discard(struct swap_info_struct *si, struct swap_cluster_info *ci) { unsigned int idx = cluster_index(si, ci); /* * If scan_swap_map_slots() can't find a free cluster, it will check * si->swap_map directly. To make sure the discarding cluster isn't * taken by scan_swap_map_slots(), mark the swap entries bad (occupied). 
* It will be cleared after discard */ memset(si->swap_map + idx * SWAPFILE_CLUSTER, SWAP_MAP_BAD, SWAPFILE_CLUSTER); VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE); move_cluster(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD); schedule_work(&si->discard_work); } static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { lockdep_assert_held(&ci->lock); move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); ci->order = 0; } /* * Isolate and lock the first cluster that is not contented on a list, * clean its flag before taken off-list. Cluster flag must be in sync * with list status, so cluster updaters can always know the cluster * list status without touching si lock. * * Note it's possible that all clusters on a list are contented so * this returns NULL for an non-empty list. */ static struct swap_cluster_info *isolate_lock_cluster( struct swap_info_struct *si, struct list_head *list) { struct swap_cluster_info *ci, *ret = NULL; spin_lock(&si->lock); if (unlikely(!(si->flags & SWP_WRITEOK))) goto out; list_for_each_entry(ci, list, list) { if (!spin_trylock(&ci->lock)) continue; /* We may only isolate and clear flags of following lists */ VM_BUG_ON(!ci->flags); VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE && ci->flags != CLUSTER_FLAG_FULL); list_del(&ci->list); ci->flags = CLUSTER_FLAG_NONE; ret = ci; break; } out: spin_unlock(&si->lock); return ret; } /* * Doing discard actually. After a cluster discard is finished, the cluster * will be added to free cluster list. Discard cluster is a bit special as * they don't participate in allocation or reclaim, so clusters marked as * CLUSTER_FLAG_DISCARD must remain off-list or on discard list. */ static bool swap_do_scheduled_discard(struct swap_info_struct *si) { struct swap_cluster_info *ci; bool ret = false; unsigned int idx; spin_lock(&si->lock); while (!list_empty(&si->discard_clusters)) { ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); /* * Delete the cluster from list to prepare for discard, but keep * the CLUSTER_FLAG_DISCARD flag, there could be percpu_cluster * pointing to it, or ran into by relocate_cluster. */ list_del(&ci->list); idx = cluster_index(si, ci); spin_unlock(&si->lock); discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, SWAPFILE_CLUSTER); spin_lock(&ci->lock); /* * Discard is done, clear its flags as it's off-list, then * return the cluster to allocation list. */ ci->flags = CLUSTER_FLAG_NONE; memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); __free_cluster(si, ci); spin_unlock(&ci->lock); ret = true; spin_lock(&si->lock); } spin_unlock(&si->lock); return ret; } static void swap_discard_work(struct work_struct *work) { struct swap_info_struct *si; si = container_of(work, struct swap_info_struct, discard_work); swap_do_scheduled_discard(si); } static void swap_users_ref_free(struct percpu_ref *ref) { struct swap_info_struct *si; si = container_of(ref, struct swap_info_struct, users); complete(&si->comp); } /* * Must be called after freeing if ci->count == 0, moves the cluster to free * or discard list. */ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { VM_BUG_ON(ci->count != 0); VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE); lockdep_assert_held(&ci->lock); /* * If the swap is discardable, prepare discard the cluster * instead of free it immediately. The cluster will be freed * after discard. 
*/ if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == (SWP_WRITEOK | SWP_PAGE_DISCARD)) { swap_cluster_schedule_discard(si, ci); return; } __free_cluster(si, ci); } /* * Must be called after freeing if ci->count != 0, moves the cluster to * nonfull list. */ static void partial_free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER); lockdep_assert_held(&ci->lock); if (ci->flags != CLUSTER_FLAG_NONFULL) move_cluster(si, ci, &si->nonfull_clusters[ci->order], CLUSTER_FLAG_NONFULL); } /* * Must be called after allocation, moves the cluster to full or frag list. * Note: allocation doesn't acquire si lock, and may drop the ci lock for * reclaim, so the cluster could be any where when called. */ static void relocate_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { lockdep_assert_held(&ci->lock); /* Discard cluster must remain off-list or on discard list */ if (cluster_is_discard(ci)) return; if (!ci->count) { free_cluster(si, ci); } else if (ci->count != SWAPFILE_CLUSTER) { if (ci->flags != CLUSTER_FLAG_FRAG) move_cluster(si, ci, &si->frag_clusters[ci->order], CLUSTER_FLAG_FRAG); } else { if (ci->flags != CLUSTER_FLAG_FULL) move_cluster(si, ci, &si->full_clusters, CLUSTER_FLAG_FULL); } } /* * The cluster corresponding to page_nr will be used. The cluster will not be * added to free cluster list and its usage counter will be increased by 1. * Only used for initialization. */ static void inc_cluster_info_page(struct swap_info_struct *si, struct swap_cluster_info *cluster_info, unsigned long page_nr) { unsigned long idx = page_nr / SWAPFILE_CLUSTER; struct swap_cluster_info *ci; ci = cluster_info + idx; ci->count++; VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); VM_BUG_ON(ci->flags); } static bool cluster_reclaim_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned long start, unsigned long end) { unsigned char *map = si->swap_map; unsigned long offset = start; int nr_reclaim; spin_unlock(&ci->lock); do { switch (READ_ONCE(map[offset])) { case 0: offset++; break; case SWAP_HAS_CACHE: nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); if (nr_reclaim > 0) offset += nr_reclaim; else goto out; break; default: goto out; } } while (offset < end); out: spin_lock(&ci->lock); /* * Recheck the range no matter reclaim succeeded or not, the slot * could have been be freed while we are not holding the lock. 
*/ for (offset = start; offset < end; offset++) if (READ_ONCE(map[offset])) return false; return true; } static bool cluster_scan_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned long start, unsigned int nr_pages, bool *need_reclaim) { unsigned long offset, end = start + nr_pages; unsigned char *map = si->swap_map; for (offset = start; offset < end; offset++) { switch (READ_ONCE(map[offset])) { case 0: continue; case SWAP_HAS_CACHE: if (!vm_swap_full()) return false; *need_reclaim = true; continue; default: return false; } } return true; } static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned int start, unsigned char usage, unsigned int order) { unsigned int nr_pages = 1 << order; lockdep_assert_held(&ci->lock); if (!(si->flags & SWP_WRITEOK)) return false; /* * The first allocation in a cluster makes the * cluster exclusive to this order */ if (cluster_is_empty(ci)) ci->order = order; memset(si->swap_map + start, usage, nr_pages); swap_range_alloc(si, nr_pages); ci->count += nr_pages; return true; } /* Try use a new cluster for current CPU and allocate from it. */ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned long offset, unsigned int order, unsigned char usage) { unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER); unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); unsigned int nr_pages = 1 << order; bool need_reclaim, ret; lockdep_assert_held(&ci->lock); if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER) goto out; for (end -= nr_pages; offset <= end; offset += nr_pages) { need_reclaim = false; if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim)) continue; if (need_reclaim) { ret = cluster_reclaim_range(si, ci, offset, offset + nr_pages); /* * Reclaim drops ci->lock and cluster could be used * by another order. Not checking flag as off-list * cluster has no flag set, and change of list * won't cause fragmentation. 
*/ if (!cluster_is_usable(ci, order)) goto out; if (cluster_is_empty(ci)) offset = start; /* Reclaim failed but cluster is usable, try next */ if (!ret) continue; } if (!cluster_alloc_range(si, ci, offset, usage, order)) break; found = offset; offset += nr_pages; if (ci->count < SWAPFILE_CLUSTER && offset <= end) next = offset; break; } out: relocate_cluster(si, ci); unlock_cluster(ci); if (si->flags & SWP_SOLIDSTATE) __this_cpu_write(si->percpu_cluster->next[order], next); else si->global_cluster->next[order] = next; return found; } /* Return true if reclaimed a whole cluster */ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) { long to_scan = 1; unsigned long offset, end; struct swap_cluster_info *ci; unsigned char *map = si->swap_map; int nr_reclaim; if (force) to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER; while ((ci = isolate_lock_cluster(si, &si->full_clusters))) { offset = cluster_offset(si, ci); end = min(si->max, offset + SWAPFILE_CLUSTER); to_scan--; while (offset < end) { if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { spin_unlock(&ci->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); spin_lock(&ci->lock); if (nr_reclaim) { offset += abs(nr_reclaim); continue; } } offset++; } unlock_cluster(ci); if (to_scan <= 0) break; } } static void swap_reclaim_work(struct work_struct *work) { struct swap_info_struct *si; si = container_of(work, struct swap_info_struct, reclaim_work); swap_reclaim_full_clusters(si, true); } /* * Try to get swap entries with specified order from current cpu's swap entry * pool (a cluster). This might involve allocating a new cluster for current CPU * too. */ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, unsigned char usage) { struct swap_cluster_info *ci; unsigned int offset, found = 0; if (si->flags & SWP_SOLIDSTATE) { /* Fast path using per CPU cluster */ local_lock(&si->percpu_cluster->lock); offset = __this_cpu_read(si->percpu_cluster->next[order]); } else { /* Serialize HDD SWAP allocation for each device. */ spin_lock(&si->global_cluster_lock); offset = si->global_cluster->next[order]; } if (offset) { ci = lock_cluster(si, offset); /* Cluster could have been used by another order */ if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) offset = cluster_offset(si, ci); found = alloc_swap_scan_cluster(si, ci, offset, order, usage); } else { unlock_cluster(ci); } if (found) goto done; } new_cluster: ci = isolate_lock_cluster(si, &si->free_clusters); if (ci) { found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), order, usage); if (found) goto done; } /* Try reclaim from full clusters if free clusters list is drained */ if (vm_swap_full()) swap_reclaim_full_clusters(si, false); if (order < PMD_ORDER) { unsigned int frags = 0, frags_existing; while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) { found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), order, usage); if (found) goto done; /* Clusters failed to allocate are moved to frag_clusters */ frags++; } frags_existing = atomic_long_read(&si->frag_cluster_nr[order]); while (frags < frags_existing && (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) { atomic_long_dec(&si->frag_cluster_nr[order]); /* * Rotate the frag list to iterate, they were all * failing high order allocation or moved here due to * per-CPU usage, but they could contain newly released * reclaimable (eg. lazy-freed swap cache) slots. 
*/ found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), order, usage); if (found) goto done; frags++; } } /* * We don't have free cluster but have some clusters in * discarding, do discard now and reclaim them, then * reread cluster_next_cpu since we dropped si->lock */ if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si)) goto new_cluster; if (order) goto done; /* Order 0 stealing from higher order */ for (int o = 1; o < SWAP_NR_ORDERS; o++) { /* * Clusters here have at least one usable slots and can't fail order 0 * allocation, but reclaim may drop si->lock and race with another user. */ while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) { atomic_long_dec(&si->frag_cluster_nr[o]); found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), 0, usage); if (found) goto done; } while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) { found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), 0, usage); if (found) goto done; } } done: if (si->flags & SWP_SOLIDSTATE) local_unlock(&si->percpu_cluster->lock); else spin_unlock(&si->global_cluster_lock); return found; } /* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff) { int nid; unsigned long pages; spin_lock(&swap_avail_lock); if (swapoff) { /* * Forcefully remove it. Clear the SWP_WRITEOK flags for * swapoff here so it's synchronized by both si->lock and * swap_avail_lock, to ensure the result can be seen by * add_to_avail_list. */ lockdep_assert_held(&si->lock); si->flags &= ~SWP_WRITEOK; atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages); } else { /* * If not called by swapoff, take it off-list only if it's * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly * si->inuse_pages == pages), any concurrent slot freeing, * or device already removed from plist by someone else * will make this return false. */ pages = si->pages; if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages, pages | SWAP_USAGE_OFFLIST_BIT)) goto skip; } for_each_node(nid) plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]); skip: spin_unlock(&swap_avail_lock); } /* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */ static void add_to_avail_list(struct swap_info_struct *si, bool swapon) { int nid; long val; unsigned long pages; spin_lock(&swap_avail_lock); /* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */ if (swapon) { lockdep_assert_held(&si->lock); si->flags |= SWP_WRITEOK; } else { if (!(READ_ONCE(si->flags) & SWP_WRITEOK)) goto skip; } if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT)) goto skip; val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages); /* * When device is full and device is on the plist, only one updater will * see (inuse_pages == si->pages) and will call del_from_avail_list. If * that updater happen to be here, just skip adding. */ pages = si->pages; if (val == pages) { /* Just like the cmpxchg in del_from_avail_list */ if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages, pages | SWAP_USAGE_OFFLIST_BIT)) goto skip; } for_each_node(nid) plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]); skip: spin_unlock(&swap_avail_lock); } /* * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock * within each cluster, so the total contribution to the global counter should * always be positive and cannot exceed the total number of usable slots. 
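 *
 * (Summarising the two helpers below: swap_usage_add() takes the device off
 * the avail plist once the counter reaches si->pages, and swap_usage_sub()
 * puts it back the first time slots are freed while the off-list bit is
 * still set.)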
*/ static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries) { long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages); /* * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set, * remove it from the plist. */ if (unlikely(val == si->pages)) { del_from_avail_list(si, false); return true; } return false; } static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries) { long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages); /* * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set, * remove it from the plist. */ if (unlikely(val & SWAP_USAGE_OFFLIST_BIT)) add_to_avail_list(si, false); } static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries) { if (swap_usage_add(si, nr_entries)) { if (vm_swap_full()) schedule_work(&si->reclaim_work); } } static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { unsigned long begin = offset; unsigned long end = offset + nr_entries - 1; void (*swap_slot_free_notify)(struct block_device *, unsigned long); unsigned int i; /* * Use atomic clear_bit operations only on zeromap instead of non-atomic * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes. */ for (i = 0; i < nr_entries; i++) { clear_bit(offset + i, si->zeromap); zswap_invalidate(swp_entry(si->type, offset + i)); } if (si->flags & SWP_BLKDEV) swap_slot_free_notify = si->bdev->bd_disk->fops->swap_slot_free_notify; else swap_slot_free_notify = NULL; while (offset <= end) { arch_swap_invalidate_page(si->type, offset); if (swap_slot_free_notify) swap_slot_free_notify(si->bdev, offset); offset++; } clear_shadow_from_swap_cache(si->type, begin, end); /* * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 * only after the above cleanups are done. */ smp_wmb(); atomic_long_add(nr_entries, &nr_swap_pages); swap_usage_sub(si, nr_entries); } static int cluster_alloc_swap(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[], int order) { int n_ret = 0; while (n_ret < nr) { unsigned long offset = cluster_alloc_swap_entry(si, order, usage); if (!offset) break; slots[n_ret++] = swp_entry(si->type, offset); } return n_ret; } static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[], int order) { unsigned int nr_pages = 1 << order; /* * We try to cluster swap pages by allocating them sequentially * in swap. Once we've allocated SWAPFILE_CLUSTER pages this * way, however, we resort to first-free allocation, starting * a new cluster. This prevents us from scattering swap pages * all over the entire swap partition, so that we reduce * overall disk seek times between swap pages. -- sct * But we do now try to find an empty cluster. -Andrea * And we let swap pages go all over an SSD partition. Hugh */ if (order > 0) { /* * Should not even be attempting large allocations when huge * page swap is disabled. Warn and fail the allocation. */ if (!IS_ENABLED(CONFIG_THP_SWAP) || nr_pages > SWAPFILE_CLUSTER) { VM_WARN_ON_ONCE(1); return 0; } /* * Swapfile is not block device so unable * to allocate large entries. 
*/ if (!(si->flags & SWP_BLKDEV)) return 0; } return cluster_alloc_swap(si, usage, nr, slots, order); } static bool get_swap_device_info(struct swap_info_struct *si) { if (!percpu_ref_tryget_live(&si->users)) return false; /* * Guarantee the si->users are checked before accessing other * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is * up to dated. * * Paired with the spin_unlock() after setup_swap_info() in * enable_swap_info(), and smp_wmb() in swapoff. */ smp_rmb(); return true; } int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) { int order = swap_entry_order(entry_order); unsigned long size = 1 << order; struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; int node; spin_lock(&swap_avail_lock); avail_pgs = atomic_long_read(&nr_swap_pages) / size; if (avail_pgs <= 0) { spin_unlock(&swap_avail_lock); goto noswap; } n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); atomic_long_sub(n_goal * size, &nr_swap_pages); start_over: node = numa_node_id(); plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { /* requeue si to after same-priority siblings */ plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); if (get_swap_device_info(si)) { n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal, swp_entries, order); put_swap_device(si); if (n_ret || size > 1) goto check_out; } spin_lock(&swap_avail_lock); /* * if we got here, it's likely that si was almost full before, * and since scan_swap_map_slots() can drop the si->lock, * multiple callers probably all tried to get a page from the * same si and it filled up before we could get one; or, the si * filled up between us dropping swap_avail_lock and taking * si->lock. Since we dropped the swap_avail_lock, the * swap_avail_head list may have been modified; so if next is * still in the swap_avail_head list then try it, otherwise * start over if we have not gotten any slots. */ if (plist_node_empty(&next->avail_lists[node])) goto start_over; } spin_unlock(&swap_avail_lock); check_out: if (n_ret < n_goal) atomic_long_add((long)(n_goal - n_ret) * size, &nr_swap_pages); noswap: return n_ret; } static struct swap_info_struct *_swap_info_get(swp_entry_t entry) { struct swap_info_struct *si; unsigned long offset; if (!entry.val) goto out; si = swp_swap_info(entry); if (!si) goto bad_nofile; if (data_race(!(si->flags & SWP_USED))) goto bad_device; offset = swp_offset(entry); if (offset >= si->max) goto bad_offset; if (data_race(!si->swap_map[swp_offset(entry)])) goto bad_free; return si; bad_free: pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); goto out; bad_offset: pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); goto out; bad_device: pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val); goto out; bad_nofile: pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); out: return NULL; } static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, unsigned long offset, unsigned char usage) { unsigned char count; unsigned char has_cache; count = si->swap_map[offset]; has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; if (usage == SWAP_HAS_CACHE) { VM_BUG_ON(!has_cache); has_cache = 0; } else if (count == SWAP_MAP_SHMEM) { /* * Or we could insist on shmem.c using a special * swap_shmem_free() and free_shmem_swap_and_cache()... 
*/ count = 0; } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { if (count == COUNT_CONTINUED) { if (swap_count_continued(si, offset, count)) count = SWAP_MAP_MAX | COUNT_CONTINUED; else count = SWAP_MAP_MAX; } else count--; } usage = count | has_cache; if (usage) WRITE_ONCE(si->swap_map[offset], usage); else WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE); return usage; } /* * When we get a swap entry, if there aren't some other ways to * prevent swapoff, such as the folio in swap cache is locked, RCU * reader side is locked, etc., the swap entry may become invalid * because of swapoff. Then, we need to enclose all swap related * functions with get_swap_device() and put_swap_device(), unless the * swap functions call get/put_swap_device() by themselves. * * RCU reader side lock (including any spinlock) is sufficient to * prevent swapoff, because synchronize_rcu() is called in swapoff() * before freeing data structures. * * Check whether swap entry is valid in the swap device. If so, * return pointer to swap_info_struct, and keep the swap entry valid * via preventing the swap device from being swapoff, until * put_swap_device() is called. Otherwise return NULL. * * Notice that swapoff or swapoff+swapon can still happen before the * percpu_ref_tryget_live() in get_swap_device() or after the * percpu_ref_put() in put_swap_device() if there isn't any other way * to prevent swapoff. The caller must be prepared for that. For * example, the following situation is possible. * * CPU1 CPU2 * do_swap_page() * ... swapoff+swapon * __read_swap_cache_async() * swapcache_prepare() * __swap_duplicate() * // check swap_map * // verify PTE not changed * * In __swap_duplicate(), the swap_map need to be checked before * changing partly because the specified swap entry may be for another * swap device which has been swapoff. And in do_swap_page(), after * the page is read from the swap device, the PTE is verified not * changed with the page table locked to check whether the swap device * has been swapoff or swapoff+swapon. 
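 *
 * A minimal usage sketch (illustrative only, not lifted from a particular
 * caller):
 *
 *	si = get_swap_device(entry);
 *	if (si) {
 *		... access the swap entry; the device cannot complete
 *		    swapoff in this window ...
 *		put_swap_device(si);
 *	}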
*/ struct swap_info_struct *get_swap_device(swp_entry_t entry) { struct swap_info_struct *si; unsigned long offset; if (!entry.val) goto out; si = swp_swap_info(entry); if (!si) goto bad_nofile; if (!get_swap_device_info(si)) goto out; offset = swp_offset(entry); if (offset >= si->max) goto put_out; return si; bad_nofile: pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); out: return NULL; put_out: pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); percpu_ref_put(&si->users); return NULL; } static unsigned char __swap_entry_free(struct swap_info_struct *si, swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char usage; ci = lock_cluster(si, offset); usage = __swap_entry_free_locked(si, offset, 1); if (!usage) swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); unlock_cluster(ci); return usage; } static bool __swap_entries_free(struct swap_info_struct *si, swp_entry_t entry, int nr) { unsigned long offset = swp_offset(entry); unsigned int type = swp_type(entry); struct swap_cluster_info *ci; bool has_cache = false; unsigned char count; int i; if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1) goto fallback; /* cross into another cluster */ if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) goto fallback; ci = lock_cluster(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { unlock_cluster(ci); goto fallback; } for (i = 0; i < nr; i++) WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); if (!has_cache) swap_entry_range_free(si, ci, entry, nr); unlock_cluster(ci); return has_cache; fallback: for (i = 0; i < nr; i++) { if (data_race(si->swap_map[offset + i])) { count = __swap_entry_free(si, swp_entry(type, offset + i)); if (count == SWAP_HAS_CACHE) has_cache = true; } else { WARN_ON_ONCE(1); } } return has_cache; } /* * Drop the last HAS_CACHE flag of swap entries, caller have to * ensure all entries belong to the same cgroup. */ static void swap_entry_range_free(struct swap_info_struct *si, struct swap_cluster_info *ci, swp_entry_t entry, unsigned int nr_pages) { unsigned long offset = swp_offset(entry); unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; /* It should never free entries across different clusters */ VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1)); VM_BUG_ON(cluster_is_empty(ci)); VM_BUG_ON(ci->count < nr_pages); ci->count -= nr_pages; do { VM_BUG_ON(*map != SWAP_HAS_CACHE); *map = 0; } while (++map < map_end); mem_cgroup_uncharge_swap(entry, nr_pages); swap_range_free(si, offset, nr_pages); if (!ci->count) free_cluster(si, ci); else partial_free_cluster(si, ci); } static void cluster_swap_free_nr(struct swap_info_struct *si, unsigned long offset, int nr_pages, unsigned char usage) { struct swap_cluster_info *ci; unsigned long end = offset + nr_pages; ci = lock_cluster(si, offset); do { if (!__swap_entry_free_locked(si, offset, usage)) swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); } while (++offset < end); unlock_cluster(ci); } /* * Caller has made sure that the swap device corresponding to entry * is still around or has not been recycled. 
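 * (swap_free_nr() below walks the range one cluster at a time, so the range
 * passed by the caller may span cluster boundaries.)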
*/ void swap_free_nr(swp_entry_t entry, int nr_pages) { int nr; struct swap_info_struct *sis; unsigned long offset = swp_offset(entry); sis = _swap_info_get(entry); if (!sis) return; while (nr_pages) { nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); cluster_swap_free_nr(sis, offset, nr, 1); offset += nr; nr_pages -= nr; } } /* * Called after dropping swapcache to decrease refcnt to swap entries. */ void put_swap_folio(struct folio *folio, swp_entry_t entry) { unsigned long offset = swp_offset(entry); struct swap_cluster_info *ci; struct swap_info_struct *si; int size = 1 << swap_entry_order(folio_order(folio)); si = _swap_info_get(entry); if (!si) return; ci = lock_cluster(si, offset); if (swap_is_has_cache(si, offset, size)) swap_entry_range_free(si, ci, entry, size); else { for (int i = 0; i < size; i++, entry.val++) { if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) swap_entry_range_free(si, ci, entry, 1); } } unlock_cluster(ci); } void swapcache_free_entries(swp_entry_t *entries, int n) { int i; struct swap_cluster_info *ci; struct swap_info_struct *si = NULL; if (n <= 0) return; for (i = 0; i < n; ++i) { si = _swap_info_get(entries[i]); if (si) { ci = lock_cluster(si, swp_offset(entries[i])); swap_entry_range_free(si, ci, entries[i], 1); unlock_cluster(ci); } } } int __swap_count(swp_entry_t entry) { struct swap_info_struct *si = swp_swap_info(entry); pgoff_t offset = swp_offset(entry); return swap_count(si->swap_map[offset]); } /* * How many references to @entry are currently swapped out? * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. */ int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; int count; ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); unlock_cluster(ci); return count; } /* * How many references to @entry are currently swapped out? * This considers COUNT_CONTINUED so it returns exact answer. 
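 *
 * (The exact value is reconstructed below by walking the continuation pages:
 * the swap_map byte supplies the low-order part, and each continuation byte
 * adds a higher-order digit, scaled first by SWAP_MAP_MAX + 1 and then by a
 * further factor of SWAP_CONT_MAX + 1 per additional page.)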
*/ int swp_swapcount(swp_entry_t entry) { int count, tmp_count, n; struct swap_info_struct *si; struct swap_cluster_info *ci; struct page *page; pgoff_t offset; unsigned char *map; si = _swap_info_get(entry); if (!si) return 0; offset = swp_offset(entry); ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); if (!(count & COUNT_CONTINUED)) goto out; count &= ~COUNT_CONTINUED; n = SWAP_MAP_MAX + 1; page = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; VM_BUG_ON(page_private(page) != SWP_CONTINUED); do { page = list_next_entry(page, lru); map = kmap_local_page(page); tmp_count = map[offset]; kunmap_local(map); count += (tmp_count & ~COUNT_CONTINUED) * n; n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: unlock_cluster(ci); return count; } static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, swp_entry_t entry, int order) { struct swap_cluster_info *ci; unsigned char *map = si->swap_map; unsigned int nr_pages = 1 << order; unsigned long roffset = swp_offset(entry); unsigned long offset = round_down(roffset, nr_pages); int i; bool ret = false; ci = lock_cluster(si, offset); if (nr_pages == 1) { if (swap_count(map[roffset])) ret = true; goto unlock_out; } for (i = 0; i < nr_pages; i++) { if (swap_count(map[offset + i])) { ret = true; break; } } unlock_out: unlock_cluster(ci); return ret; } static bool folio_swapped(struct folio *folio) { swp_entry_t entry = folio->swap; struct swap_info_struct *si = _swap_info_get(entry); if (!si) return false; if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) return swap_swapcount(si, entry) != 0; return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); } static bool folio_swapcache_freeable(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!folio_test_swapcache(folio)) return false; if (folio_test_writeback(folio)) return false; /* * Once hibernation has begun to create its image of memory, * there's a danger that one of the calls to folio_free_swap() * - most probably a call from __try_to_reclaim_swap() while * hibernation is allocating its own swap pages for the image, * but conceivably even a call from memory reclaim - will free * the swap from a folio which has already been recorded in the * image as a clean swapcache folio, and then reuse its swap for * another page of the image. On waking from hibernation, the * original folio might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * * Hibernation suspends storage while it is writing the image * to disk so check that here. */ if (pm_suspended_storage()) return false; return true; } /** * folio_free_swap() - Free the swap space used for this folio. * @folio: The folio to remove. * * If swap is getting full, or if there are no more mappings of this folio, * then call folio_free_swap to free its swap space. * * Return: true if we were able to release the swap space. */ bool folio_free_swap(struct folio *folio) { if (!folio_swapcache_freeable(folio)) return false; if (folio_swapped(folio)) return false; delete_from_swap_cache(folio); folio_set_dirty(folio); return true; } /** * free_swap_and_cache_nr() - Release reference on range of swap entries and * reclaim their cache if no more references remain. * @entry: First entry of range. * @nr: Number of entries in range. * * For each swap entry in the contiguous range, release a reference. If any swap * entries become free, try to reclaim their underlying folios, if present. 
The * offset range is defined by [entry.offset, entry.offset + nr). */ void free_swap_and_cache_nr(swp_entry_t entry, int nr) { const unsigned long start_offset = swp_offset(entry); const unsigned long end_offset = start_offset + nr; struct swap_info_struct *si; bool any_only_cache = false; unsigned long offset; if (non_swap_entry(entry)) return; si = get_swap_device(entry); if (!si) return; if (WARN_ON(end_offset > si->max)) goto out; /* * First free all entries in the range. */ any_only_cache = __swap_entries_free(si, entry, nr); /* * Short-circuit the below loop if none of the entries had their * reference drop to zero. */ if (!any_only_cache) goto out; /* * Now go back over the range trying to reclaim the swap cache. This is * more efficient for large folios because we will only try to reclaim * the swap once per folio in the common case. If we do * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the * latter will get a reference and lock the folio for every individual * page but will only succeed once the swap slot for every subpage is * zero. */ for (offset = start_offset; offset < end_offset; offset += nr) { nr = 1; if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { /* * Folios are always naturally aligned in swap so * advance forward to the next boundary. Zero means no * folio was found for the swap entry, so advance by 1 * in this case. Negative value means folio was found * but could not be reclaimed. Here we can still advance * to the next boundary. */ nr = __try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL); if (nr == 0) nr = 1; else if (nr < 0) nr = -nr; nr = ALIGN(offset + 1, nr) - offset; } } out: put_swap_device(si); } #ifdef CONFIG_HIBERNATION swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si = swap_type_to_swap_info(type); swp_entry_t entry = {0}; if (!si) goto fail; /* This is called for allocating swap entry, not cache */ if (get_swap_device_info(si)) { if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0)) atomic_long_dec(&nr_swap_pages); put_swap_device(si); } fail: return entry; } /* * Find the swap type that corresponds to given device (if any). * * @offset - number of the PAGE_SIZE-sized block of the device, starting * from 0, in which the swap header is expected to be located. * * This is needed for the suspend to disk (aka swsusp). */ int swap_type_of(dev_t device, sector_t offset) { int type; if (!device) return -1; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; if (device == sis->bdev->bd_dev) { struct swap_extent *se = first_se(sis); if (se->start_block == offset) { spin_unlock(&swap_lock); return type; } } } spin_unlock(&swap_lock); return -ENODEV; } int find_first_swap(dev_t *device) { int type; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; *device = sis->bdev->bd_dev; spin_unlock(&swap_lock); return type; } spin_unlock(&swap_lock); return -ENODEV; } /* * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev * corresponding to given index in swap_info (swap type). 
*/ sector_t swapdev_block(int type, pgoff_t offset) { struct swap_info_struct *si = swap_type_to_swap_info(type); struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) return 0; se = offset_to_swap_extent(si, offset); return se->start_block + (offset - se->start_page); } /* * Return either the total number of swap pages of given type, or the number * of free pages of that type (depending on @free) * * This is needed for software suspend */ unsigned int count_swap_pages(int type, int free) { unsigned int n = 0; spin_lock(&swap_lock); if ((unsigned int)type < nr_swapfiles) { struct swap_info_struct *sis = swap_info[type]; spin_lock(&sis->lock); if (sis->flags & SWP_WRITEOK) { n = sis->pages; if (free) n -= swap_usage_in_pages(sis); } spin_unlock(&sis->lock); } spin_unlock(&swap_lock); return n; } #endif /* CONFIG_HIBERNATION */ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) { return pte_same(pte_swp_clear_flags(pte), swp_pte); } /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to * force COW, vm_page_prot omits write permission from any private vma. */ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct folio *folio) { struct page *page; struct folio *swapcache; spinlock_t *ptl; pte_t *pte, new_pte, old_pte; bool hwpoisoned = false; int ret = 1; swapcache = folio; folio = ksm_might_need_to_copy(folio, vma, addr); if (unlikely(!folio)) return -ENOMEM; else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { hwpoisoned = true; folio = swapcache; } page = folio_file_page(folio, swp_offset(entry)); if (PageHWPoison(page)) hwpoisoned = true; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), swp_entry_to_pte(entry)))) { ret = 0; goto out; } old_pte = ptep_get(pte); if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) { swp_entry_t swp_entry; dec_mm_counter(vma->vm_mm, MM_SWAPENTS); if (hwpoisoned) { swp_entry = make_hwpoison_entry(page); } else { swp_entry = make_poisoned_swp_entry(); } new_pte = swp_entry_to_pte(swp_entry); ret = 0; goto setpte; } /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry * so this must be called before swap_free(). */ arch_swap_restore(folio_swap(entry, folio), folio); dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); folio_get(folio); if (folio == swapcache) { rmap_t rmap_flags = RMAP_NONE; /* * See do_swap_page(): writeback would be problematic. * However, we do a folio_wait_writeback() just before this * call and have the folio locked. */ VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); if (pte_swp_exclusive(old_pte)) rmap_flags |= RMAP_EXCLUSIVE; /* * We currently only expect small !anon folios, which are either * fully exclusive or fully shared. If we ever get large folios * here, we have to be careful. 
*/ if (!folio_test_anon(folio)) { VM_WARN_ON_ONCE(folio_test_large(folio)); VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); folio_add_new_anon_rmap(folio, vma, addr, rmap_flags); } else { folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags); } } else { /* ksm created a completely new copy */ folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); if (pte_swp_soft_dirty(old_pte)) new_pte = pte_mksoft_dirty(new_pte); if (pte_swp_uffd_wp(old_pte)) new_pte = pte_mkuffd_wp(new_pte); setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); swap_free(entry); out: if (pte) pte_unmap_unlock(pte, ptl); if (folio != swapcache) { folio_unlock(folio); folio_put(folio); } return ret; } static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned int type) { pte_t *pte = NULL; struct swap_info_struct *si; si = swap_info[type]; do { struct folio *folio; unsigned long offset; unsigned char swp_count; swp_entry_t entry; int ret; pte_t ptent; if (!pte++) { pte = pte_offset_map(pmd, addr); if (!pte) break; } ptent = ptep_get_lockless(pte); if (!is_swap_pte(ptent)) continue; entry = pte_to_swp_entry(ptent); if (swp_type(entry) != type) continue; offset = swp_offset(entry); pte_unmap(pte); pte = NULL; folio = swap_cache_get_folio(entry, vma, addr); if (!folio) { struct vm_fault vmf = { .vma = vma, .address = addr, .real_address = addr, .pmd = pmd, }; folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); } if (!folio) { swp_count = READ_ONCE(si->swap_map[offset]); if (swp_count == 0 || swp_count == SWAP_MAP_BAD) continue; return -ENOMEM; } folio_lock(folio); folio_wait_writeback(folio); ret = unuse_pte(vma, pmd, addr, entry, folio); if (ret < 0) { folio_unlock(folio); folio_put(folio); return ret; } folio_free_swap(folio); folio_unlock(folio); folio_put(folio); } while (addr += PAGE_SIZE, addr != end); if (pte) pte_unmap(pte); return 0; } static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, unsigned int type) { pmd_t *pmd; unsigned long next; int ret; pmd = pmd_offset(pud, addr); do { cond_resched(); next = pmd_addr_end(addr, end); ret = unuse_pte_range(vma, pmd, addr, next, type); if (ret) return ret; } while (pmd++, addr = next, addr != end); return 0; } static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned int type) { pud_t *pud; unsigned long next; int ret; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; ret = unuse_pmd_range(vma, pud, addr, next, type); if (ret) return ret; } while (pud++, addr = next, addr != end); return 0; } static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned int type) { p4d_t *p4d; unsigned long next; int ret; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; ret = unuse_pud_range(vma, p4d, addr, next, type); if (ret) return ret; } while (p4d++, addr = next, addr != end); return 0; } static int unuse_vma(struct vm_area_struct *vma, unsigned int type) { pgd_t *pgd; unsigned long addr, end, next; int ret; addr = vma->vm_start; end = vma->vm_end; pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; ret = unuse_p4d_range(vma, pgd, addr, next, type); if (ret) return 
ret; } while (pgd++, addr = next, addr != end); return 0; } static int unuse_mm(struct mm_struct *mm, unsigned int type) { struct vm_area_struct *vma; int ret = 0; VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); for_each_vma(vmi, vma) { if (vma->anon_vma && !is_vm_hugetlb_page(vma)) { ret = unuse_vma(vma, type); if (ret) break; } cond_resched(); } mmap_read_unlock(mm); return ret; } /* * Scan swap_map from current position to next entry still in use. * Return 0 if there are no inuse entries after prev till end of * the map. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, unsigned int prev) { unsigned int i; unsigned char count; /* * No need for swap_lock here: we're just looking * for whether an entry is in use, not modifying it; false * hits are okay, and sys_swapoff() has already prevented new * allocations from this area (while holding swap_lock). */ for (i = prev + 1; i < si->max; i++) { count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) break; if ((i % LATENCY_LIMIT) == 0) cond_resched(); } if (i == si->max) i = 0; return i; } static int try_to_unuse(unsigned int type) { struct mm_struct *prev_mm; struct mm_struct *mm; struct list_head *p; int retval = 0; struct swap_info_struct *si = swap_info[type]; struct folio *folio; swp_entry_t entry; unsigned int i; if (!swap_usage_in_pages(si)) goto success; retry: retval = shmem_unuse(type); if (retval) return retval; prev_mm = &init_mm; mmget(prev_mm); spin_lock(&mmlist_lock); p = &init_mm.mmlist; while (swap_usage_in_pages(si) && !signal_pending(current) && (p = p->next) != &init_mm.mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!mmget_not_zero(mm)) continue; spin_unlock(&mmlist_lock); mmput(prev_mm); prev_mm = mm; retval = unuse_mm(mm, type); if (retval) { mmput(prev_mm); return retval; } /* * Make sure that we aren't completely killing * interactive performance. */ cond_resched(); spin_lock(&mmlist_lock); } spin_unlock(&mmlist_lock); mmput(prev_mm); i = 0; while (swap_usage_in_pages(si) && !signal_pending(current) && (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); if (IS_ERR(folio)) continue; /* * It is conceivable that a racing task removed this folio from * swap cache just before we acquired the page lock. The folio * might even be back in swap cache on another swap area. But * that is okay, folio_free_swap() only removes stale folios. */ folio_lock(folio); folio_wait_writeback(folio); folio_free_swap(folio); folio_unlock(folio); folio_put(folio); } /* * Lets check again to see if there are still swap entries in the map. * If yes, we would need to do retry the unuse logic again. * Under global memory pressure, swap entries can be reinserted back * into process space after the mmlist loop above passes over them. * * Limit the number of retries? No: when mmget_not_zero() * above fails, that mm is likely to be freeing swap from * exit_mmap(), which proceeds at its own independent pace; * and even shmem_writepage() could have been preempted after * folio_alloc_swap(), temporarily hiding that swap. It's easy * and robust (though cpu-intensive) just to keep retrying. */ if (swap_usage_in_pages(si)) { if (!signal_pending(current)) goto retry; return -EINTR; } success: /* * Make sure that further cleanups after try_to_unuse() returns happen * after swap_range_free() reduces si->inuse_pages to 0. 
*/ smp_mb(); return 0; } /* * After a successful try_to_unuse, if no swap is now in use, we know * we can empty the mmlist. swap_lock must be held on entry and exit. * Note that mmlist_lock nests inside swap_lock, and an mm must be * added to the mmlist just after page_duplicate - before would be racy. */ static void drain_mmlist(void) { struct list_head *p, *next; unsigned int type; for (type = 0; type < nr_swapfiles; type++) if (swap_usage_in_pages(swap_info[type])) return; spin_lock(&mmlist_lock); list_for_each_safe(p, next, &init_mm.mmlist) list_del_init(p); spin_unlock(&mmlist_lock); } /* * Free all of a swapdev's extent information */ static void destroy_swap_extents(struct swap_info_struct *sis) { while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) { struct rb_node *rb = sis->swap_extent_root.rb_node; struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node); rb_erase(rb, &sis->swap_extent_root); kfree(se); } if (sis->flags & SWP_ACTIVATED) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; sis->flags &= ~SWP_ACTIVATED; if (mapping->a_ops->swap_deactivate) mapping->a_ops->swap_deactivate(swap_file); } } /* * Add a block range (and the corresponding page range) into this swapdev's * extent tree. * * This function rather assumes that it is called in ascending page order. */ int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL; struct swap_extent *se; struct swap_extent *new_se; /* * place the new node at the right most since the * function is called in ascending page order. */ while (*link) { parent = *link; link = &parent->rb_right; } if (parent) { se = rb_entry(parent, struct swap_extent, rb_node); BUG_ON(se->start_page + se->nr_pages != start_page); if (se->start_block + se->nr_pages == start_block) { /* Merge it */ se->nr_pages += nr_pages; return 0; } } /* No merge, insert a new extent. */ new_se = kmalloc(sizeof(*se), GFP_KERNEL); if (new_se == NULL) return -ENOMEM; new_se->start_page = start_page; new_se->nr_pages = nr_pages; new_se->start_block = start_block; rb_link_node(&new_se->rb_node, parent, link); rb_insert_color(&new_se->rb_node, &sis->swap_extent_root); return 1; } EXPORT_SYMBOL_GPL(add_swap_extent); /* * A `swap extent' is a simple thing which maps a contiguous range of pages * onto a contiguous range of disk blocks. A rbtree of swap extents is * built at swapon time and is then used at swap_writepage/swap_read_folio * time for locating where on disk a page belongs. * * If the swapfile is an S_ISBLK block device, a single extent is installed. * This is done so that the main operating code can treat S_ISBLK and S_ISREG * swap files identically. * * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK * swapfiles are handled *identically* after swapon time. * * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks * and will parse them into a rbtree, in PAGE_SIZE chunks. If some stray * blocks are found which do not fall within the PAGE_SIZE alignment * requirements, they are simply tossed out - we will never use those blocks * for swapping. * * For all swap devices we set S_SWAPFILE across the life of the swapon. This * prevents users from writing to the swap device, which will corrupt memory. 
* * The amount of disk space which a single swap extent represents varies. * Typically it is in the 1-4 megabyte range. So we can have hundreds of * extents in the rbtree. - akpm. */ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) { struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; int ret; if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; return ret; } if (mapping->a_ops->swap_activate) { ret = mapping->a_ops->swap_activate(sis, swap_file, span); if (ret < 0) return ret; sis->flags |= SWP_ACTIVATED; if ((sis->flags & SWP_FS_OPS) && sio_pool_init() != 0) { destroy_swap_extents(sis); return -ENOMEM; } return ret; } return generic_swapfile_activate(sis, swap_file, span); } static int swap_node(struct swap_info_struct *si) { struct block_device *bdev; if (si->bdev) bdev = si->bdev; else bdev = si->swap_file->f_inode->i_sb->s_bdev; return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; } static void setup_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, unsigned long *zeromap) { int i; if (prio >= 0) si->prio = prio; else si->prio = --least_priority; /* * the plist prio is negated because plist ordering is * low-to-high, while swap ordering is high-to-low */ si->list.prio = -si->prio; for_each_node(i) { if (si->prio >= 0) si->avail_lists[i].prio = -si->prio; else { if (swap_node(si) == i) si->avail_lists[i].prio = 1; else si->avail_lists[i].prio = -si->prio; } } si->swap_map = swap_map; si->cluster_info = cluster_info; si->zeromap = zeromap; } static void _enable_swap_info(struct swap_info_struct *si) { atomic_long_add(si->pages, &nr_swap_pages); total_swap_pages += si->pages; assert_spin_locked(&swap_lock); /* * both lists are plists, and thus priority ordered. * swap_active_head needs to be priority ordered for swapoff(), * which on removal of any swap_info_struct with an auto-assigned * (i.e. negative) priority increments the auto-assigned priority * of any lower-priority swap_info_structs. * swap_avail_head needs to be priority ordered for folio_alloc_swap(), * which allocates swap pages from the highest available priority * swap_info_struct. */ plist_add(&si->list, &swap_active_head); /* Add back to available list */ add_to_avail_list(si, true); } static void enable_swap_info(struct swap_info_struct *si, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, unsigned long *zeromap) { spin_lock(&swap_lock); spin_lock(&si->lock); setup_swap_info(si, prio, swap_map, cluster_info, zeromap); spin_unlock(&si->lock); spin_unlock(&swap_lock); /* * Finished initializing swap device, now it's safe to reference it. */ percpu_ref_resurrect(&si->users); spin_lock(&swap_lock); spin_lock(&si->lock); _enable_swap_info(si); spin_unlock(&si->lock); spin_unlock(&swap_lock); } static void reinsert_swap_info(struct swap_info_struct *si) { spin_lock(&swap_lock); spin_lock(&si->lock); setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap); _enable_swap_info(si); spin_unlock(&si->lock); spin_unlock(&swap_lock); } static bool __has_usable_swap(void) { return !plist_head_empty(&swap_active_head); } bool has_usable_swap(void) { bool ret; spin_lock(&swap_lock); ret = __has_usable_swap(); spin_unlock(&swap_lock); return ret; } /* * Called after clearing SWP_WRITEOK, ensures cluster_alloc_range * see the updated flags, so there will be no more allocations. 
*/ static void wait_for_allocation(struct swap_info_struct *si) { unsigned long offset; unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER); struct swap_cluster_info *ci; BUG_ON(si->flags & SWP_WRITEOK); for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) { ci = lock_cluster(si, offset); unlock_cluster(ci); } } SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; unsigned char *swap_map; unsigned long *zeromap; struct swap_cluster_info *cluster_info; struct file *swap_file, *victim; struct address_space *mapping; struct inode *inode; struct filename *pathname; int err, found = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; BUG_ON(!current->mm); pathname = getname(specialfile); if (IS_ERR(pathname)) return PTR_ERR(pathname); victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); err = PTR_ERR(victim); if (IS_ERR(victim)) goto out; mapping = victim->f_mapping; spin_lock(&swap_lock); plist_for_each_entry(p, &swap_active_head, list) { if (p->flags & SWP_WRITEOK) { if (p->swap_file->f_mapping == mapping) { found = 1; break; } } } if (!found) { err = -EINVAL; spin_unlock(&swap_lock); goto out_dput; } if (!security_vm_enough_memory_mm(current->mm, p->pages)) vm_unacct_memory(p->pages); else { err = -ENOMEM; spin_unlock(&swap_lock); goto out_dput; } spin_lock(&p->lock); del_from_avail_list(p, true); if (p->prio < 0) { struct swap_info_struct *si = p; int nid; plist_for_each_entry_continue(si, &swap_active_head, list) { si->prio++; si->list.prio--; for_each_node(nid) { if (si->avail_lists[nid].prio != 1) si->avail_lists[nid].prio--; } } least_priority++; } plist_del(&p->list, &swap_active_head); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; spin_unlock(&p->lock); spin_unlock(&swap_lock); wait_for_allocation(p); disable_swap_slots_cache_lock(); set_current_oom_origin(); err = try_to_unuse(p->type); clear_current_oom_origin(); if (err) { /* re-insert swap space back into swap_list */ reinsert_swap_info(p); reenable_swap_slots_cache_unlock(); goto out_dput; } reenable_swap_slots_cache_unlock(); /* * Wait for swap operations protected by get/put_swap_device() * to complete. Because of synchronize_rcu() here, all swap * operations protected by RCU reader side lock (including any * spinlock) will be waited too. This makes it easy to * prevent folio_test_swapcache() and the following swap cache * operations from racing with swapoff.
*/ percpu_ref_kill(&p->users); synchronize_rcu(); wait_for_completion(&p->comp); flush_work(&p->discard_work); flush_work(&p->reclaim_work); destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); if (!p->bdev || !bdev_nonrot(p->bdev)) atomic_dec(&nr_rotate_swap); mutex_lock(&swapon_mutex); spin_lock(&swap_lock); spin_lock(&p->lock); drain_mmlist(); swap_file = p->swap_file; p->swap_file = NULL; p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; zeromap = p->zeromap; p->zeromap = NULL; cluster_info = p->cluster_info; p->cluster_info = NULL; spin_unlock(&p->lock); spin_unlock(&swap_lock); arch_swap_invalidate_area(p->type); zswap_swapoff(p->type); mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; kfree(p->global_cluster); p->global_cluster = NULL; vfree(swap_map); kvfree(zeromap); kvfree(cluster_info); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); exit_swap_address_space(p->type); inode = mapping->host; inode_lock(inode); inode->i_flags &= ~S_SWAPFILE; inode_unlock(inode); filp_close(swap_file, NULL); /* * Clear the SWP_USED flag after all resources are freed so that swapon * can reuse this swap_info in alloc_swap_info() safely. It is ok to * not hold p->lock after we cleared its SWP_WRITEOK. */ spin_lock(&swap_lock); p->flags = 0; spin_unlock(&swap_lock); err = 0; atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); out_dput: filp_close(victim, NULL); out: putname(pathname); return err; } #ifdef CONFIG_PROC_FS static __poll_t swaps_poll(struct file *file, poll_table *wait) { struct seq_file *seq = file->private_data; poll_wait(file, &proc_poll_wait, wait); if (seq->poll_event != atomic_read(&proc_poll_event)) { seq->poll_event = atomic_read(&proc_poll_event); return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; } return EPOLLIN | EPOLLRDNORM; } /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) { struct swap_info_struct *si; int type; loff_t l = *pos; mutex_lock(&swapon_mutex); if (!l) return SEQ_START_TOKEN; for (type = 0; (si = swap_type_to_swap_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) return si; } return NULL; } static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) { struct swap_info_struct *si = v; int type; if (v == SEQ_START_TOKEN) type = 0; else type = si->type + 1; ++(*pos); for (; (si = swap_type_to_swap_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; return si; } return NULL; } static void swap_stop(struct seq_file *swap, void *v) { mutex_unlock(&swapon_mutex); } static int swap_show(struct seq_file *swap, void *v) { struct swap_info_struct *si = v; struct file *file; int len; unsigned long bytes, inuse; if (si == SEQ_START_TOKEN) { seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); return 0; } bytes = K(si->pages); inuse = K(swap_usage_in_pages(si)); file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file_inode(file)->i_mode) ? "partition" : "file\t", bytes, bytes < 10000000 ? "\t" : "", inuse, inuse < 10000000 ? 
"\t" : "", si->prio); return 0; } static const struct seq_operations swaps_op = { .start = swap_start, .next = swap_next, .stop = swap_stop, .show = swap_show }; static int swaps_open(struct inode *inode, struct file *file) { struct seq_file *seq; int ret; ret = seq_open(file, &swaps_op); if (ret) return ret; seq = file->private_data; seq->poll_event = atomic_read(&proc_poll_event); return 0; } static const struct proc_ops swaps_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = swaps_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release, .proc_poll = swaps_poll, }; static int __init procswaps_init(void) { proc_create("swaps", 0, NULL, &swaps_proc_ops); return 0; } __initcall(procswaps_init); #endif /* CONFIG_PROC_FS */ #ifdef MAX_SWAPFILES_CHECK static int __init max_swapfiles_check(void) { MAX_SWAPFILES_CHECK(); return 0; } late_initcall(max_swapfiles_check); #endif static struct swap_info_struct *alloc_swap_info(void) { struct swap_info_struct *p; struct swap_info_struct *defer = NULL; unsigned int type; int i; p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); if (percpu_ref_init(&p->users, swap_users_ref_free, PERCPU_REF_INIT_DEAD, GFP_KERNEL)) { kvfree(p); return ERR_PTR(-ENOMEM); } spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { if (!(swap_info[type]->flags & SWP_USED)) break; } if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); percpu_ref_exit(&p->users); kvfree(p); return ERR_PTR(-EPERM); } if (type >= nr_swapfiles) { p->type = type; /* * Publish the swap_info_struct after initializing it. * Note that kvzalloc() above zeroes all its fields. */ smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */ nr_swapfiles++; } else { defer = p; p = swap_info[type]; /* * Do not memset this entry: a racing procfs swap_next() * would be relying on p->type to remain valid. */ } p->swap_extent_root = RB_ROOT; plist_node_init(&p->list, 0); for_each_node(i) plist_node_init(&p->avail_lists[i], 0); p->flags = SWP_USED; spin_unlock(&swap_lock); if (defer) { percpu_ref_exit(&defer->users); kvfree(defer); } spin_lock_init(&p->lock); spin_lock_init(&p->cont_lock); atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT); init_completion(&p->comp); return p; } static int claim_swapfile(struct swap_info_struct *si, struct inode *inode) { if (S_ISBLK(inode->i_mode)) { si->bdev = I_BDEV(inode); /* * Zoned block devices contain zones that have a sequential * write only restriction. Hence zoned block devices are not * suitable for swapping. Disallow them here. */ if (bdev_is_zoned(si->bdev)) return -EINVAL; si->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { si->bdev = inode->i_sb->s_bdev; } return 0; } /* * Find out how many pages are allowed for a single swap device. There * are two limiting factors: * 1) the number of bits for the swap offset in the swp_entry_t type, and * 2) the number of bits in the swap pte, as defined by the different * architectures. * * In order to find the largest possible bit mask, a swap entry with * swap type 0 and swap offset ~0UL is created, encoded to a swap pte, * decoded to a swp_entry_t again, and finally the swap offset is * extracted. * * This will mask all the bits from the initial ~0UL mask that can't * be encoded in either the swp_entry_t or the architecture definition * of a swap pte. 
*/ unsigned long generic_max_swapfile_size(void) { return swp_offset(pte_to_swp_entry( swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; } /* Can be overridden by an architecture for additional checks. */ __weak unsigned long arch_max_swapfile_size(void) { return generic_max_swapfile_size(); } static unsigned long read_swap_header(struct swap_info_struct *si, union swap_header *swap_header, struct inode *inode) { int i; unsigned long maxpages; unsigned long swapfilepages; unsigned long last_page; if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { pr_err("Unable to find swap-space signature\n"); return 0; } /* swap partition endianness hack... */ if (swab32(swap_header->info.version) == 1) { swab32s(&swap_header->info.version); swab32s(&swap_header->info.last_page); swab32s(&swap_header->info.nr_badpages); if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) return 0; for (i = 0; i < swap_header->info.nr_badpages; i++) swab32s(&swap_header->info.badpages[i]); } /* Check the swap header's sub-version */ if (swap_header->info.version != 1) { pr_warn("Unable to handle swap header version %d\n", swap_header->info.version); return 0; } maxpages = swapfile_maximum_size; last_page = swap_header->info.last_page; if (!last_page) { pr_warn("Empty swap-file\n"); return 0; } if (last_page > maxpages) { pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", K(maxpages), K(last_page)); } if (maxpages > last_page) { maxpages = last_page + 1; /* p->max is an unsigned int: don't overflow it */ if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; } if (!maxpages) return 0; swapfilepages = i_size_read(inode) >> PAGE_SHIFT; if (swapfilepages && maxpages > swapfilepages) { pr_warn("Swap area shorter than signature indicates\n"); return 0; } if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) return 0; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) return 0; return maxpages; } #define SWAP_CLUSTER_INFO_COLS \ DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) #define SWAP_CLUSTER_SPACE_COLS \ DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) #define SWAP_CLUSTER_COLS \ max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) static int setup_swap_map_and_extents(struct swap_info_struct *si, union swap_header *swap_header, unsigned char *swap_map, unsigned long maxpages, sector_t *span) { unsigned int nr_good_pages; unsigned long i; int nr_extents; nr_good_pages = maxpages - 1; /* omit header page */ for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (page_nr == 0 || page_nr > swap_header->info.last_page) return -EINVAL; if (page_nr < maxpages) { swap_map[page_nr] = SWAP_MAP_BAD; nr_good_pages--; } } if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; si->max = maxpages; si->pages = nr_good_pages; nr_extents = setup_swap_extents(si, span); if (nr_extents < 0) return nr_extents; nr_good_pages = si->pages; } if (!nr_good_pages) { pr_warn("Empty swap-file\n"); return -EINVAL; } return nr_extents; } static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, union swap_header *swap_header, unsigned long maxpages) { unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); struct swap_cluster_info *cluster_info; unsigned long i, j, k, idx; int cpu, err = -ENOMEM; cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL); if (!cluster_info) goto err; for (i = 0; i < nr_clusters; i++) spin_lock_init(&cluster_info[i].lock); if (si->flags & 
SWP_SOLIDSTATE) { si->percpu_cluster = alloc_percpu(struct percpu_cluster); if (!si->percpu_cluster) goto err_free; for_each_possible_cpu(cpu) { struct percpu_cluster *cluster; cluster = per_cpu_ptr(si->percpu_cluster, cpu); for (i = 0; i < SWAP_NR_ORDERS; i++) cluster->next[i] = SWAP_ENTRY_INVALID; local_lock_init(&cluster->lock); } } else { si->global_cluster = kmalloc(sizeof(*si->global_cluster), GFP_KERNEL); if (!si->global_cluster) goto err_free; for (i = 0; i < SWAP_NR_ORDERS; i++) si->global_cluster->next[i] = SWAP_ENTRY_INVALID; spin_lock_init(&si->global_cluster_lock); } /* * Mark unusable pages as unavailable. The clusters aren't * marked free yet, so no list operations are involved yet. * * See setup_swap_map_and_extents(): header page, bad pages, * and the EOF part of the last cluster. */ inc_cluster_info_page(si, cluster_info, 0); for (i = 0; i < swap_header->info.nr_badpages; i++) inc_cluster_info_page(si, cluster_info, swap_header->info.badpages[i]); for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) inc_cluster_info_page(si, cluster_info, i); INIT_LIST_HEAD(&si->free_clusters); INIT_LIST_HEAD(&si->full_clusters); INIT_LIST_HEAD(&si->discard_clusters); for (i = 0; i < SWAP_NR_ORDERS; i++) { INIT_LIST_HEAD(&si->nonfull_clusters[i]); INIT_LIST_HEAD(&si->frag_clusters[i]); atomic_long_set(&si->frag_cluster_nr[i], 0); } /* * Reduce false cache line sharing between cluster_info and * sharing same address space. */ for (k = 0; k < SWAP_CLUSTER_COLS; k++) { j = k % SWAP_CLUSTER_COLS; for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { struct swap_cluster_info *ci; idx = i * SWAP_CLUSTER_COLS + j; ci = cluster_info + idx; if (idx >= nr_clusters) continue; if (ci->count) { ci->flags = CLUSTER_FLAG_NONFULL; list_add_tail(&ci->list, &si->nonfull_clusters[0]); continue; } ci->flags = CLUSTER_FLAG_FREE; list_add_tail(&ci->list, &si->free_clusters); } } return cluster_info; err_free: kvfree(cluster_info); err: return ERR_PTR(err); } SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *si; struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; struct dentry *dentry; int prio; int error; union swap_header *swap_header; int nr_extents; sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; unsigned long *zeromap = NULL; struct swap_cluster_info *cluster_info = NULL; struct folio *folio = NULL; struct inode *inode = NULL; bool inced_nr_rotate_swap = false; if (swap_flags & ~SWAP_FLAGS_VALID) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!swap_avail_heads) return -ENOMEM; si = alloc_swap_info(); if (IS_ERR(si)) return PTR_ERR(si); INIT_WORK(&si->discard_work, swap_discard_work); INIT_WORK(&si->reclaim_work, swap_reclaim_work); name = getname(specialfile); if (IS_ERR(name)) { error = PTR_ERR(name); name = NULL; goto bad_swap; } swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0); if (IS_ERR(swap_file)) { error = PTR_ERR(swap_file); swap_file = NULL; goto bad_swap; } si->swap_file = swap_file; mapping = swap_file->f_mapping; dentry = swap_file->f_path.dentry; inode = mapping->host; error = claim_swapfile(si, inode); if (unlikely(error)) goto bad_swap; inode_lock(inode); if (d_unlinked(dentry) || cant_mount(dentry)) { error = -ENOENT; goto bad_swap_unlock_inode; } if (IS_SWAPFILE(inode)) { error = -EBUSY; goto bad_swap_unlock_inode; } /* * Read the swap header. 
*/ if (!mapping->a_ops->read_folio) { error = -EINVAL; goto bad_swap_unlock_inode; } folio = read_mapping_folio(mapping, 0, swap_file); if (IS_ERR(folio)) { error = PTR_ERR(folio); goto bad_swap_unlock_inode; } swap_header = kmap_local_folio(folio, 0); maxpages = read_swap_header(si, swap_header, inode); if (unlikely(!maxpages)) { error = -EINVAL; goto bad_swap_unlock_inode; } /* OK, set up the swap map and apply the bad block list */ swap_map = vzalloc(maxpages); if (!swap_map) { error = -ENOMEM; goto bad_swap_unlock_inode; } error = swap_cgroup_swapon(si->type, maxpages); if (error) goto bad_swap_unlock_inode; nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map, maxpages, &span); if (unlikely(nr_extents < 0)) { error = nr_extents; goto bad_swap_unlock_inode; } /* * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might * be above MAX_PAGE_ORDER incase of a large swap file. */ zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long), GFP_KERNEL | __GFP_ZERO); if (!zeromap) { error = -ENOMEM; goto bad_swap_unlock_inode; } if (si->bdev && bdev_stable_writes(si->bdev)) si->flags |= SWP_STABLE_WRITES; if (si->bdev && bdev_synchronous(si->bdev)) si->flags |= SWP_SYNCHRONOUS_IO; if (si->bdev && bdev_nonrot(si->bdev)) { si->flags |= SWP_SOLIDSTATE; } else { atomic_inc(&nr_rotate_swap); inced_nr_rotate_swap = true; } cluster_info = setup_clusters(si, swap_header, maxpages); if (IS_ERR(cluster_info)) { error = PTR_ERR(cluster_info); cluster_info = NULL; goto bad_swap_unlock_inode; } if ((swap_flags & SWAP_FLAG_DISCARD) && si->bdev && bdev_max_discard_sectors(si->bdev)) { /* * When discard is enabled for swap with no particular * policy flagged, we set all swap discard flags here in * order to sustain backward compatibility with older * swapon(8) releases. */ si->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | SWP_PAGE_DISCARD); /* * By flagging sys_swapon, a sysadmin can tell us to * either do single-time area discards only, or to just * perform discards for released swap page-clusters. * Now it's time to adjust the p->flags accordingly. */ if (swap_flags & SWAP_FLAG_DISCARD_ONCE) si->flags &= ~SWP_PAGE_DISCARD; else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) si->flags &= ~SWP_AREA_DISCARD; /* issue a swapon-time discard if it's still required */ if (si->flags & SWP_AREA_DISCARD) { int err = discard_swap(si); if (unlikely(err)) pr_err("swapon: discard_swap(%p): %d\n", si, err); } } error = init_swap_address_space(si->type, maxpages); if (error) goto bad_swap_unlock_inode; error = zswap_swapon(si->type, maxpages); if (error) goto free_swap_address_space; /* * Flush any pending IO and dirty mappings before we start using this * swap device. */ inode->i_flags |= S_SWAPFILE; error = inode_drain_writes(inode); if (error) { inode->i_flags &= ~S_SWAPFILE; goto free_swap_zswap; } mutex_lock(&swapon_mutex); prio = -1; if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; enable_swap_info(si, prio, swap_map, cluster_info, zeromap); pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", K(si->pages), name->name, si->prio, nr_extents, K((unsigned long long)span), (si->flags & SWP_SOLIDSTATE) ? "SS" : "", (si->flags & SWP_DISCARDABLE) ? "D" : "", (si->flags & SWP_AREA_DISCARD) ? "s" : "", (si->flags & SWP_PAGE_DISCARD) ? 
"c" : ""); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); error = 0; goto out; free_swap_zswap: zswap_swapoff(si->type); free_swap_address_space: exit_swap_address_space(si->type); bad_swap_unlock_inode: inode_unlock(inode); bad_swap: free_percpu(si->percpu_cluster); si->percpu_cluster = NULL; kfree(si->global_cluster); si->global_cluster = NULL; inode = NULL; destroy_swap_extents(si); swap_cgroup_swapoff(si->type); spin_lock(&swap_lock); si->swap_file = NULL; si->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); kvfree(zeromap); kvfree(cluster_info); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) filp_close(swap_file, NULL); out: if (!IS_ERR_OR_NULL(folio)) folio_release_kmap(folio, swap_header); if (name) putname(name); if (inode) inode_unlock(inode); if (!error) enable_swap_slots_cache(); return error; } void si_swapinfo(struct sysinfo *val) { unsigned int type; unsigned long nr_to_be_unused = 0; spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { struct swap_info_struct *si = swap_info[type]; if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) nr_to_be_unused += swap_usage_in_pages(si); } val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; spin_unlock(&swap_lock); } /* * Verify that nr swap entries are valid and increment their swap map counts. * * Returns error code in following case. * - success -> 0 * - swp_entry is invalid -> EINVAL * - swp_entry is migration entry -> EINVAL * - swap-cache reference is requested but there is already one. -> EEXIST * - swap-cache reference is requested but the entry is not used. -> ENOENT * - swap-mapped reference requested but needs continued swap count. -> ENOMEM */ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) { struct swap_info_struct *si; struct swap_cluster_info *ci; unsigned long offset; unsigned char count; unsigned char has_cache; int err, i; si = swp_swap_info(entry); offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); VM_WARN_ON(usage == 1 && nr > 1); ci = lock_cluster(si, offset); err = 0; for (i = 0; i < nr; i++) { count = si->swap_map[offset + i]; /* * swapin_readahead() doesn't check if a swap entry is valid, so the * swap entry could be SWAP_MAP_BAD. Check here with lock held. */ if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { err = -ENOENT; goto unlock_out; } has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; if (!count && !has_cache) { err = -ENOENT; } else if (usage == SWAP_HAS_CACHE) { if (has_cache) err = -EEXIST; } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) { err = -EINVAL; } if (err) goto unlock_out; } for (i = 0; i < nr; i++) { count = si->swap_map[offset + i]; has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; if (usage == SWAP_HAS_CACHE) has_cache = SWAP_HAS_CACHE; else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) count += usage; else if (swap_count_continued(si, offset + i, count)) count = COUNT_CONTINUED; else { /* * Don't need to rollback changes, because if * usage == 1, there must be nr == 1. */ err = -ENOMEM; goto unlock_out; } WRITE_ONCE(si->swap_map[offset + i], count | has_cache); } unlock_out: unlock_cluster(ci); return err; } /* * Help swapoff by noting that swap entry belongs to shmem/tmpfs * (in which case its reference count is never incremented). 
*/ void swap_shmem_alloc(swp_entry_t entry, int nr) { __swap_duplicate(entry, SWAP_MAP_SHMEM, nr); } /* * Increase reference count of swap entry by 1. * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required * but could not be atomically allocated. Returns 0, just as if it succeeded, * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which * might occur if a page table entry has got corrupted. */ int swap_duplicate(swp_entry_t entry) { int err = 0; while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) err = add_swap_count_continuation(entry, GFP_ATOMIC); return err; } /* * @entry: first swap entry from which we allocate nr swap cache. * * Called when allocating swap cache for existing swap entries, * This can return error codes. Returns 0 at success. * -EEXIST means there is a swap cache. * Note: return code is different from swap_duplicate(). */ int swapcache_prepare(swp_entry_t entry, int nr) { return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); } void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { unsigned long offset = swp_offset(entry); cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE); } struct swap_info_struct *swp_swap_info(swp_entry_t entry) { return swap_type_to_swap_info(swp_type(entry)); } /* * out-of-line methods to avoid include hell. */ struct address_space *swapcache_mapping(struct folio *folio) { return swp_swap_info(folio->swap)->swap_file->f_mapping; } EXPORT_SYMBOL_GPL(swapcache_mapping); pgoff_t __folio_swap_cache_index(struct folio *folio) { return swap_cache_index(folio->swap); } EXPORT_SYMBOL_GPL(__folio_swap_cache_index); /* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's * page of the original vmalloc'ed swap_map, to hold the continuation count * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. * * These continuation pages are seldom referenced: the common paths all work * on the original swap_map, only referring to a continuation page when the * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. * * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) * can be called after dropping locks. */ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) { struct swap_info_struct *si; struct swap_cluster_info *ci; struct page *head; struct page *page; struct page *list_page; pgoff_t offset; unsigned char count; int ret = 0; /* * When debugging, it's easier to use __GFP_ZERO here; but it's better * for latency not to zero a page while GFP_ATOMIC and holding locks. */ page = alloc_page(gfp_mask | __GFP_HIGHMEM); si = get_swap_device(entry); if (!si) { /* * An acceptable race has occurred since the failing * __swap_duplicate(): the swap device may be swapoff */ goto outer; } offset = swp_offset(entry); ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { /* * The higher the swap count, the more likely it is that tasks * will race to add swap count continuation: we need to avoid * over-provisioning. 
*/ goto out; } if (!page) { ret = -ENOMEM; goto out; } head = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; spin_lock(&si->cont_lock); /* * Page allocation does not initialize the page's lru field, * but it does always reset its private field. */ if (!page_private(head)) { BUG_ON(count & COUNT_CONTINUED); INIT_LIST_HEAD(&head->lru); set_page_private(head, SWP_CONTINUED); si->flags |= SWP_CONTINUED; } list_for_each_entry(list_page, &head->lru, lru) { unsigned char *map; /* * If the previous map said no continuation, but we've found * a continuation page, free our allocation and use this one. */ if (!(count & COUNT_CONTINUED)) goto out_unlock_cont; map = kmap_local_page(list_page) + offset; count = *map; kunmap_local(map); /* * If this continuation count now has some space in it, * free our allocation and use this one. */ if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) goto out_unlock_cont; } list_add_tail(&page->lru, &head->lru); page = NULL; /* now it's attached, don't free it */ out_unlock_cont: spin_unlock(&si->cont_lock); out: unlock_cluster(ci); put_swap_device(si); outer: if (page) __free_page(page); return ret; } /* * swap_count_continued - when the original swap_map count is incremented * from SWAP_MAP_MAX, check if there is already a continuation page to carry * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster * lock. */ static bool swap_count_continued(struct swap_info_struct *si, pgoff_t offset, unsigned char count) { struct page *head; struct page *page; unsigned char *map; bool ret; head = vmalloc_to_page(si->swap_map + offset); if (page_private(head) != SWP_CONTINUED) { BUG_ON(count & COUNT_CONTINUED); return false; /* need to add count continuation */ } spin_lock(&si->cont_lock); offset &= ~PAGE_MASK; page = list_next_entry(head, lru); map = kmap_local_page(page) + offset; if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ goto init_map; /* jump over SWAP_CONT_MAX checks */ if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ /* * Think of how you add 1 to 999 */ while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { kunmap_local(map); page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_local_page(page) + offset; } if (*map == SWAP_CONT_MAX) { kunmap_local(map); page = list_next_entry(page, lru); if (page == head) { ret = false; /* add count continuation */ goto out; } map = kmap_local_page(page) + offset; init_map: *map = 0; /* we didn't zero the page */ } *map += 1; kunmap_local(map); while ((page = list_prev_entry(page, lru)) != head) { map = kmap_local_page(page) + offset; *map = COUNT_CONTINUED; kunmap_local(map); } ret = true; /* incremented */ } else { /* decrementing */ /* * Think of how you subtract 1 from 1000 */ BUG_ON(count != COUNT_CONTINUED); while (*map == COUNT_CONTINUED) { kunmap_local(map); page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_local_page(page) + offset; } BUG_ON(*map == 0); *map -= 1; if (*map == 0) count = 0; kunmap_local(map); while ((page = list_prev_entry(page, lru)) != head) { map = kmap_local_page(page) + offset; *map = SWAP_CONT_MAX | count; count = COUNT_CONTINUED; kunmap_local(map); } ret = count == COUNT_CONTINUED; } out: spin_unlock(&si->cont_lock); return ret; } /* * free_swap_count_continuations - swapoff free 
all the continuation pages * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. */ static void free_swap_count_continuations(struct swap_info_struct *si) { pgoff_t offset; for (offset = 0; offset < si->max; offset += PAGE_SIZE) { struct page *head; head = vmalloc_to_page(si->swap_map + offset); if (page_private(head)) { struct page *page, *next; list_for_each_entry_safe(page, next, &head->lru, lru) { list_del(&page->lru); __free_page(page); } } } } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp) { struct swap_info_struct *si, *next; int nid = folio_nid(folio); if (!(gfp & __GFP_IO)) return; if (!__has_usable_swap()) return; if (!blk_cgroup_congested()) return; /* * We've already scheduled a throttle, avoid taking the global swap * lock. */ if (current->throttle_disk) return; spin_lock(&swap_avail_lock); plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) { if (si->bdev) { blkcg_schedule_throttle(si->bdev->bd_disk, true); break; } } spin_unlock(&swap_avail_lock); } #endif static int __init swapfile_init(void) { int nid; swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), GFP_KERNEL); if (!swap_avail_heads) { pr_emerg("Not enough memory for swap heads, swap is disabled\n"); return -ENOMEM; } for_each_node(nid) plist_head_init(&swap_avail_heads[nid]); swapfile_maximum_size = arch_max_swapfile_size(); #ifdef CONFIG_MIGRATION if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS)) swap_migration_ad_supported = true; #endif /* CONFIG_MIGRATION */ return 0; } subsys_initcall(swapfile_init); |
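/*
 * Editor's sketch, not part of the kernel sources above: the file implements
 * the swapon(2) and swapoff(2) system calls (SYSCALL_DEFINE2(swapon, ...) and
 * SYSCALL_DEFINE1(swapoff, ...)). The minimal userspace program below shows
 * how they are typically driven through the glibc wrappers in <sys/swap.h>.
 * The swap file path is hypothetical and is assumed to have been prepared
 * with mkswap beforehand.
 */
#include <stdio.h>
#include <sys/swap.h>

int main(void)
{
	/* Hypothetical path; assumed to be pre-formatted with mkswap. */
	const char *path = "/swapfile";
	/* SWAP_FLAG_PREFER requests an explicit priority (here 5). */
	int flags = SWAP_FLAG_PREFER |
		    ((5 << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK);

	if (swapon(path, flags) != 0) {		/* enters sys_swapon() above */
		perror("swapon");
		return 1;
	}
	/* ... swap area is now active and listed in /proc/swaps ... */
	if (swapoff(path) != 0) {		/* enters sys_swapoff() above */
		perror("swapoff");
		return 1;
	}
	return 0;
}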
// SPDX-License-Identifier: GPL-2.0-only /* * ebtable_nat * * Authors: * Bart De Schuymer <bdschuym@pandora.be> * * April, 2002 * */ #include <linux/netfilter_bridge/ebtables.h> #include <uapi/linux/netfilter_bridge.h> #include <linux/module.h> #define NAT_VALID_HOOKS ((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT) | \ (1 << NF_BR_POST_ROUTING)) static struct ebt_entries initial_chains[] = { { .name = "PREROUTING", .policy = EBT_ACCEPT, }, { .name = "OUTPUT", .policy = EBT_ACCEPT, }, { .name = "POSTROUTING", .policy = EBT_ACCEPT, } }; static struct ebt_replace_kernel initial_table = { .name = "nat", .valid_hooks = NAT_VALID_HOOKS, .entries_size = 3 * sizeof(struct ebt_entries), .hook_entry = { [NF_BR_PRE_ROUTING] = &initial_chains[0], [NF_BR_LOCAL_OUT] = &initial_chains[1], [NF_BR_POST_ROUTING] = &initial_chains[2], }, .entries = (char *)initial_chains, }; static const struct ebt_table frame_nat = { .name = "nat", .table = &initial_table, .valid_hooks = NAT_VALID_HOOKS, .me = THIS_MODULE, }; static const struct nf_hook_ops ebt_ops_nat[] = { { .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_OUT, .priority = NF_BR_PRI_NAT_DST_OTHER, }, { .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_POST_ROUTING, .priority = NF_BR_PRI_NAT_SRC, }, { .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_PRE_ROUTING, .priority = NF_BR_PRI_NAT_DST_BRIDGED, }, }; static int frame_nat_table_init(struct net *net) { return ebt_register_table(net, &frame_nat, ebt_ops_nat); } static void __net_exit frame_nat_net_pre_exit(struct net *net) { ebt_unregister_table_pre_exit(net, "nat"); } static void __net_exit frame_nat_net_exit(struct net *net) { ebt_unregister_table(net, "nat"); } static struct pernet_operations frame_nat_net_ops = { .exit = frame_nat_net_exit, .pre_exit = frame_nat_net_pre_exit, }; static int __init ebtable_nat_init(void) { int ret = ebt_register_template(&frame_nat, frame_nat_table_init); if (ret) return ret; ret = register_pernet_subsys(&frame_nat_net_ops); if (ret) { ebt_unregister_template(&frame_nat); return ret; } return ret; } static void __exit ebtable_nat_fini(void) { unregister_pernet_subsys(&frame_nat_net_ops); ebt_unregister_template(&frame_nat); } module_init(ebtable_nat_init); module_exit(ebtable_nat_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ebtables legacy stateless nat table");
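/*
 * Editor's sketch (not from the kernel tree): the per-network-namespace
 * registration and two-stage teardown pattern used by ebtable_nat above,
 * reduced to a skeleton. register_pernet_subsys() ties the callbacks to
 * namespace lifetime; .pre_exit runs before .exit so the data-path hooks can
 * be detached while the namespace is still alive, and the backing state is
 * only freed afterwards. All example_* names are hypothetical.
 */
#include <linux/module.h>
#include <net/net_namespace.h>

static int __net_init example_net_init(struct net *net)
{
	/* per-namespace setup (ebtable_nat defers this to its template init) */
	return 0;
}

static void __net_exit example_net_pre_exit(struct net *net)
{
	/* stage 1: unhook from the packet path while the netns is still live */
}

static void __net_exit example_net_exit(struct net *net)
{
	/* stage 2: release the per-namespace state */
}

static struct pernet_operations example_net_ops = {
	.init		= example_net_init,
	.pre_exit	= example_net_pre_exit,
	.exit		= example_net_exit,
};

static int __init example_module_init(void)
{
	return register_pernet_subsys(&example_net_ops);
}

static void __exit example_module_exit(void)
{
	unregister_pernet_subsys(&example_net_ops);
}

module_init(example_module_init);
module_exit(example_module_exit);
MODULE_LICENSE("GPL");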
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM compaction #if !defined(_TRACE_COMPACTION_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_COMPACTION_H #include <linux/types.h> #include <linux/list.h> #include <linux/tracepoint.h> #include <trace/events/mmflags.h> DECLARE_EVENT_CLASS(mm_compaction_isolate_template, TP_PROTO( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_scanned, unsigned long nr_taken), TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken), TP_STRUCT__entry( __field(unsigned long, start_pfn) __field(unsigned long, end_pfn) __field(unsigned long, nr_scanned) __field(unsigned long, nr_taken) ), TP_fast_assign( __entry->start_pfn = start_pfn; __entry->end_pfn = end_pfn; __entry->nr_scanned = nr_scanned; __entry->nr_taken = nr_taken; ), TP_printk("range=(0x%lx ~ 0x%lx) nr_scanned=%lu nr_taken=%lu", __entry->start_pfn, __entry->end_pfn, __entry->nr_scanned, __entry->nr_taken) ); DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages, TP_PROTO( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_scanned, unsigned long nr_taken), TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, TP_PROTO( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_scanned, unsigned long nr_taken), TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_fast_isolate_freepages, TP_PROTO( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_scanned, unsigned long nr_taken), TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); #ifdef CONFIG_COMPACTION TRACE_EVENT(mm_compaction_migratepages, TP_PROTO(unsigned int nr_migratepages, unsigned int nr_succeeded), TP_ARGS(nr_migratepages, nr_succeeded), TP_STRUCT__entry( __field(unsigned long, nr_migrated) __field(unsigned long, nr_failed) ), TP_fast_assign( __entry->nr_migrated = nr_succeeded; __entry->nr_failed = nr_migratepages - nr_succeeded; ), TP_printk("nr_migrated=%lu nr_failed=%lu", __entry->nr_migrated, __entry->nr_failed) ); TRACE_EVENT(mm_compaction_begin, TP_PROTO(struct compact_control *cc, unsigned
long zone_start, unsigned long zone_end, bool sync), TP_ARGS(cc, zone_start, zone_end, sync), TP_STRUCT__entry( __field(unsigned long, zone_start) __field(unsigned long, migrate_pfn) __field(unsigned long, free_pfn) __field(unsigned long, zone_end) __field(bool, sync) ), TP_fast_assign( __entry->zone_start = zone_start; __entry->migrate_pfn = cc->migrate_pfn; __entry->free_pfn = cc->free_pfn; __entry->zone_end = zone_end; __entry->sync = sync; ), TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s", __entry->zone_start, __entry->migrate_pfn, __entry->free_pfn, __entry->zone_end, __entry->sync ? "sync" : "async") ); TRACE_EVENT(mm_compaction_end, TP_PROTO(struct compact_control *cc, unsigned long zone_start, unsigned long zone_end, bool sync, int status), TP_ARGS(cc, zone_start, zone_end, sync, status), TP_STRUCT__entry( __field(unsigned long, zone_start) __field(unsigned long, migrate_pfn) __field(unsigned long, free_pfn) __field(unsigned long, zone_end) __field(bool, sync) __field(int, status) ), TP_fast_assign( __entry->zone_start = zone_start; __entry->migrate_pfn = cc->migrate_pfn; __entry->free_pfn = cc->free_pfn; __entry->zone_end = zone_end; __entry->sync = sync; __entry->status = status; ), TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s status=%s", __entry->zone_start, __entry->migrate_pfn, __entry->free_pfn, __entry->zone_end, __entry->sync ? "sync" : "async", __print_symbolic(__entry->status, COMPACTION_STATUS)) ); TRACE_EVENT(mm_compaction_try_to_compact_pages, TP_PROTO( int order, gfp_t gfp_mask, int prio), TP_ARGS(order, gfp_mask, prio), TP_STRUCT__entry( __field(int, order) __field(unsigned long, gfp_mask) __field(int, prio) ), TP_fast_assign( __entry->order = order; __entry->gfp_mask = (__force unsigned long)gfp_mask; __entry->prio = prio; ), TP_printk("order=%d gfp_mask=%s priority=%d", __entry->order, show_gfp_flags(__entry->gfp_mask), __entry->prio) ); DECLARE_EVENT_CLASS(mm_compaction_suitable_template, TP_PROTO(struct zone *zone, int order, int ret), TP_ARGS(zone, order, ret), TP_STRUCT__entry( __field(int, nid) __field(enum zone_type, idx) __field(int, order) __field(int, ret) ), TP_fast_assign( __entry->nid = zone_to_nid(zone); __entry->idx = zone_idx(zone); __entry->order = order; __entry->ret = ret; ), TP_printk("node=%d zone=%-8s order=%d ret=%s", __entry->nid, __print_symbolic(__entry->idx, ZONE_TYPE), __entry->order, __print_symbolic(__entry->ret, COMPACTION_STATUS)) ); DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished, TP_PROTO(struct zone *zone, int order, int ret), TP_ARGS(zone, order, ret) ); DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable, TP_PROTO(struct zone *zone, int order, int ret), TP_ARGS(zone, order, ret) ); DECLARE_EVENT_CLASS(mm_compaction_defer_template, TP_PROTO(struct zone *zone, int order), TP_ARGS(zone, order), TP_STRUCT__entry( __field(int, nid) __field(enum zone_type, idx) __field(int, order) __field(unsigned int, considered) __field(unsigned int, defer_shift) __field(int, order_failed) ), TP_fast_assign( __entry->nid = zone_to_nid(zone); __entry->idx = zone_idx(zone); __entry->order = order; __entry->considered = zone->compact_considered; __entry->defer_shift = zone->compact_defer_shift; __entry->order_failed = zone->compact_order_failed; ), TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu", __entry->nid, __print_symbolic(__entry->idx, ZONE_TYPE), __entry->order, __entry->order_failed, 
__entry->considered, 1UL << __entry->defer_shift) ); DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_deferred, TP_PROTO(struct zone *zone, int order), TP_ARGS(zone, order) ); DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_compaction, TP_PROTO(struct zone *zone, int order), TP_ARGS(zone, order) ); DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset, TP_PROTO(struct zone *zone, int order), TP_ARGS(zone, order) ); TRACE_EVENT(mm_compaction_kcompactd_sleep, TP_PROTO(int nid), TP_ARGS(nid), TP_STRUCT__entry( __field(int, nid) ), TP_fast_assign( __entry->nid = nid; ), TP_printk("nid=%d", __entry->nid) ); DECLARE_EVENT_CLASS(kcompactd_wake_template, TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), TP_ARGS(nid, order, highest_zoneidx), TP_STRUCT__entry( __field(int, nid) __field(int, order) __field(enum zone_type, highest_zoneidx) ), TP_fast_assign( __entry->nid = nid; __entry->order = order; __entry->highest_zoneidx = highest_zoneidx; ), /* * classzone_idx is previous name of the highest_zoneidx. * Reason not to change it is the ABI requirement of the tracepoint. */ TP_printk("nid=%d order=%d classzone_idx=%-8s", __entry->nid, __entry->order, __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE)) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_wakeup_kcompactd, TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), TP_ARGS(nid, order, highest_zoneidx) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake, TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), TP_ARGS(nid, order, highest_zoneidx) ); #endif #endif /* _TRACE_COMPACTION_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
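/*
 * Editor's sketch (assumption, not part of the header above): how an event
 * declared with TRACE_EVENT()/DEFINE_EVENT() in this header is emitted from
 * kernel code. Exactly one compilation unit defines CREATE_TRACE_POINTS
 * before including the header (in the mainline tree mm/compaction.c does this
 * for these events, after pulling in the mm-internal headers that declare
 * struct compact_control); callers then use the generated trace_<name>()
 * helpers, which compile to no-ops unless the event is enabled at runtime.
 * The wrapper function name below is hypothetical.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>

static void example_report_isolation(unsigned long start_pfn,
				     unsigned long end_pfn,
				     unsigned long nr_scanned,
				     unsigned long nr_taken)
{
	/* Generated from DEFINE_EVENT(mm_compaction_isolate_template, ...). */
	trace_mm_compaction_isolate_migratepages(start_pfn, end_pfn,
						 nr_scanned, nr_taken);
}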
// SPDX-License-Identifier: GPL-2.0+
/*
 *  HID driver for UC-Logic devices not fully compliant with HID standard
 *  - tablet initialization and parameter retrieval
 *
 *  Copyright (c) 2018 Nikolai Kondrashov
 */

/*
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */

#include "hid-uclogic-params.h"
#include "hid-uclogic-rdesc.h"
#include "usbhid/usbhid.h"
#include "hid-ids.h"
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/unaligned.h>

/**
 * uclogic_params_pen_inrange_to_str() - Convert a pen in-range reporting type
 *                                       to a string.
 * @inrange:	The in-range reporting type to convert.
 *
 * Return:
 * * The string representing the type, or
 * * %NULL if the type is unknown.
 */
static const char *uclogic_params_pen_inrange_to_str(
			enum uclogic_params_pen_inrange inrange)
{
        switch (inrange) {
        case UCLOGIC_PARAMS_PEN_INRANGE_NORMAL:
                return "normal";
        case UCLOGIC_PARAMS_PEN_INRANGE_INVERTED:
                return "inverted";
        case UCLOGIC_PARAMS_PEN_INRANGE_NONE:
                return "none";
        default:
                return NULL;
        }
}

/**
 * uclogic_params_pen_hid_dbg() - Dump tablet interface pen parameters
 * @hdev:	The HID device the pen parameters describe.
 * @pen:	The pen parameters to dump.
 *
 * Dump tablet interface pen parameters with hid_dbg(). The dump is indented
 * with a tab.
 */
static void uclogic_params_pen_hid_dbg(const struct hid_device *hdev,
                                       const struct uclogic_params_pen *pen)
{
        size_t i;

        hid_dbg(hdev, "\t.usage_invalid = %s\n",
                (pen->usage_invalid ? "true" : "false"));
        hid_dbg(hdev, "\t.desc_ptr = %p\n", pen->desc_ptr);
        hid_dbg(hdev, "\t.desc_size = %u\n", pen->desc_size);
        hid_dbg(hdev, "\t.id = %u\n", pen->id);
        hid_dbg(hdev, "\t.subreport_list = {\n");
        for (i = 0; i < ARRAY_SIZE(pen->subreport_list); i++) {
                hid_dbg(hdev, "\t\t{0x%02hhx, %hhu}%s\n",
                        pen->subreport_list[i].value,
                        pen->subreport_list[i].id,
                        i < (ARRAY_SIZE(pen->subreport_list) - 1) ?
"," : ""); } hid_dbg(hdev, "\t}\n"); hid_dbg(hdev, "\t.inrange = %s\n", uclogic_params_pen_inrange_to_str(pen->inrange)); hid_dbg(hdev, "\t.fragmented_hires = %s\n", (pen->fragmented_hires ? "true" : "false")); hid_dbg(hdev, "\t.tilt_y_flipped = %s\n", (pen->tilt_y_flipped ? "true" : "false")); } /** * uclogic_params_frame_hid_dbg() - Dump tablet interface frame parameters * @hdev: The HID device the pen parameters describe. * @frame: The frame parameters to dump. * * Dump tablet interface frame parameters with hid_dbg(). The dump is * indented with two tabs. */ static void uclogic_params_frame_hid_dbg( const struct hid_device *hdev, const struct uclogic_params_frame *frame) { hid_dbg(hdev, "\t\t.desc_ptr = %p\n", frame->desc_ptr); hid_dbg(hdev, "\t\t.desc_size = %u\n", frame->desc_size); hid_dbg(hdev, "\t\t.id = %u\n", frame->id); hid_dbg(hdev, "\t\t.suffix = %s\n", frame->suffix); hid_dbg(hdev, "\t\t.re_lsb = %u\n", frame->re_lsb); hid_dbg(hdev, "\t\t.dev_id_byte = %u\n", frame->dev_id_byte); hid_dbg(hdev, "\t\t.touch_byte = %u\n", frame->touch_byte); hid_dbg(hdev, "\t\t.touch_max = %hhd\n", frame->touch_max); hid_dbg(hdev, "\t\t.touch_flip_at = %hhd\n", frame->touch_flip_at); hid_dbg(hdev, "\t\t.bitmap_dial_byte = %u\n", frame->bitmap_dial_byte); } /** * uclogic_params_hid_dbg() - Dump tablet interface parameters * @hdev: The HID device the parameters describe. * @params: The parameters to dump. * * Dump tablet interface parameters with hid_dbg(). */ void uclogic_params_hid_dbg(const struct hid_device *hdev, const struct uclogic_params *params) { size_t i; hid_dbg(hdev, ".invalid = %s\n", params->invalid ? "true" : "false"); hid_dbg(hdev, ".desc_ptr = %p\n", params->desc_ptr); hid_dbg(hdev, ".desc_size = %u\n", params->desc_size); hid_dbg(hdev, ".pen = {\n"); uclogic_params_pen_hid_dbg(hdev, ¶ms->pen); hid_dbg(hdev, "\t}\n"); hid_dbg(hdev, ".frame_list = {\n"); for (i = 0; i < ARRAY_SIZE(params->frame_list); i++) { hid_dbg(hdev, "\t{\n"); uclogic_params_frame_hid_dbg(hdev, ¶ms->frame_list[i]); hid_dbg(hdev, "\t}%s\n", i < (ARRAY_SIZE(params->frame_list) - 1) ? "," : ""); } hid_dbg(hdev, "}\n"); } /** * uclogic_params_get_str_desc - retrieve a string descriptor from a HID * device interface, putting it into a kmalloc-allocated buffer as is, without * character encoding conversion. * * @pbuf: Location for the kmalloc-allocated buffer pointer containing * the retrieved descriptor. Not modified in case of error. * Can be NULL to have retrieved descriptor discarded. * @hdev: The HID device of the tablet interface to retrieve the string * descriptor from. Cannot be NULL. * @idx: Index of the string descriptor to request from the device. * @len: Length of the buffer to allocate and the data to retrieve. * * Returns: * number of bytes retrieved (<= len), * -EPIPE, if the descriptor was not found, or * another negative errno code in case of other error. 
*/ static int uclogic_params_get_str_desc(__u8 **pbuf, struct hid_device *hdev, __u8 idx, size_t len) { int rc; struct usb_device *udev; __u8 *buf = NULL; /* Check arguments */ if (hdev == NULL) { rc = -EINVAL; goto cleanup; } udev = hid_to_usb_dev(hdev); buf = kmalloc(len, GFP_KERNEL); if (buf == NULL) { rc = -ENOMEM; goto cleanup; } rc = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), USB_REQ_GET_DESCRIPTOR, USB_DIR_IN, (USB_DT_STRING << 8) + idx, 0x0409, buf, len, USB_CTRL_GET_TIMEOUT); if (rc == -EPIPE) { hid_dbg(hdev, "string descriptor #%hhu not found\n", idx); goto cleanup; } else if (rc < 0) { hid_err(hdev, "failed retrieving string descriptor #%u: %d\n", idx, rc); goto cleanup; } if (pbuf != NULL) { *pbuf = buf; buf = NULL; } cleanup: kfree(buf); return rc; } /** * uclogic_params_pen_cleanup - free resources used by struct * uclogic_params_pen (tablet interface's pen input parameters). * Can be called repeatedly. * * @pen: Pen input parameters to cleanup. Cannot be NULL. */ static void uclogic_params_pen_cleanup(struct uclogic_params_pen *pen) { kfree(pen->desc_ptr); memset(pen, 0, sizeof(*pen)); } /** * uclogic_params_pen_init_v1() - initialize tablet interface pen * input and retrieve its parameters from the device, using v1 protocol. * * @pen: Pointer to the pen parameters to initialize (to be * cleaned up with uclogic_params_pen_cleanup()). Not modified in * case of error, or if parameters are not found. Cannot be NULL. * @pfound: Location for a flag which is set to true if the parameters * were found, and to false if not (e.g. device was * incompatible). Not modified in case of error. Cannot be NULL. * @hdev: The HID device of the tablet interface to initialize and get * parameters from. Cannot be NULL. * * Returns: * Zero, if successful. A negative errno code on error. */ static int uclogic_params_pen_init_v1(struct uclogic_params_pen *pen, bool *pfound, struct hid_device *hdev) { int rc; bool found = false; /* Buffer for (part of) the string descriptor */ __u8 *buf = NULL; /* Minimum descriptor length required, maximum seen so far is 18 */ const int len = 12; s32 resolution; /* Pen report descriptor template parameters */ s32 desc_params[UCLOGIC_RDESC_PH_ID_NUM]; __u8 *desc_ptr = NULL; /* Check arguments */ if (pen == NULL || pfound == NULL || hdev == NULL) { rc = -EINVAL; goto cleanup; } /* * Read string descriptor containing pen input parameters. * The specific string descriptor and data were discovered by sniffing * the Windows driver traffic. * NOTE: This enables fully-functional tablet mode. 
*/ rc = uclogic_params_get_str_desc(&buf, hdev, 100, len); if (rc == -EPIPE) { hid_dbg(hdev, "string descriptor with pen parameters not found, assuming not compatible\n"); goto finish; } else if (rc < 0) { hid_err(hdev, "failed retrieving pen parameters: %d\n", rc); goto cleanup; } else if (rc != len) { hid_dbg(hdev, "string descriptor with pen parameters has invalid length (got %d, expected %d), assuming not compatible\n", rc, len); goto finish; } /* * Fill report descriptor parameters from the string descriptor */ desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_LM] = get_unaligned_le16(buf + 2); desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_LM] = get_unaligned_le16(buf + 4); desc_params[UCLOGIC_RDESC_PEN_PH_ID_PRESSURE_LM] = get_unaligned_le16(buf + 8); resolution = get_unaligned_le16(buf + 10); if (resolution == 0) { desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_PM] = 0; desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_PM] = 0; } else { desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_PM] = desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_LM] * 1000 / resolution; desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_PM] = desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_LM] * 1000 / resolution; } kfree(buf); buf = NULL; /* * Generate pen report descriptor */ desc_ptr = uclogic_rdesc_template_apply( uclogic_rdesc_v1_pen_template_arr, uclogic_rdesc_v1_pen_template_size, desc_params, ARRAY_SIZE(desc_params)); if (desc_ptr == NULL) { rc = -ENOMEM; goto cleanup; } /* * Fill-in the parameters */ memset(pen, 0, sizeof(*pen)); pen->desc_ptr = desc_ptr; desc_ptr = NULL; pen->desc_size = uclogic_rdesc_v1_pen_template_size; pen->id = UCLOGIC_RDESC_V1_PEN_ID; pen->inrange = UCLOGIC_PARAMS_PEN_INRANGE_INVERTED; found = true; finish: *pfound = found; rc = 0; cleanup: kfree(desc_ptr); kfree(buf); return rc; } /** * uclogic_params_get_le24() - get a 24-bit little-endian number from a * buffer. * * @p: The pointer to the number buffer. * * Returns: * The retrieved number */ static s32 uclogic_params_get_le24(const void *p) { const __u8 *b = p; return b[0] | (b[1] << 8UL) | (b[2] << 16UL); } /** * uclogic_params_pen_init_v2() - initialize tablet interface pen * input and retrieve its parameters from the device, using v2 protocol. * * @pen: Pointer to the pen parameters to initialize (to be * cleaned up with uclogic_params_pen_cleanup()). Not * modified in case of error, or if parameters are not * found. Cannot be NULL. * @pfound: Location for a flag which is set to true if the * parameters were found, and to false if not (e.g. * device was incompatible). Not modified in case of * error. Cannot be NULL. * @pparams_ptr: Location for a kmalloc'ed pointer to the retrieved raw * parameters, which could be used to identify the tablet * to some extent. Should be freed with kfree after use. * NULL, if not needed. Not modified in case of error. * Only set if *pfound is set to true. * @pparams_len: Location for the length of the retrieved raw * parameters. NULL, if not needed. Not modified in case * of error. Only set if *pfound is set to true. * @hdev: The HID device of the tablet interface to initialize * and get parameters from. Cannot be NULL. * * Returns: * Zero, if successful. A negative errno code on error. 
*/ static int uclogic_params_pen_init_v2(struct uclogic_params_pen *pen, bool *pfound, __u8 **pparams_ptr, size_t *pparams_len, struct hid_device *hdev) { int rc; bool found = false; /* Buffer for (part of) the parameter string descriptor */ __u8 *buf = NULL; /* Parameter string descriptor required length */ const int params_len_min = 18; /* Parameter string descriptor accepted length */ const int params_len_max = 32; /* Parameter string descriptor received length */ int params_len; size_t i; s32 resolution; /* Pen report descriptor template parameters */ s32 desc_params[UCLOGIC_RDESC_PH_ID_NUM]; __u8 *desc_ptr = NULL; /* Check arguments */ if (pen == NULL || pfound == NULL || hdev == NULL) { rc = -EINVAL; goto cleanup; } /* * Read string descriptor containing pen input parameters. * The specific string descriptor and data were discovered by sniffing * the Windows driver traffic. * NOTE: This enables fully-functional tablet mode. */ rc = uclogic_params_get_str_desc(&buf, hdev, 200, params_len_max); if (rc == -EPIPE) { hid_dbg(hdev, "string descriptor with pen parameters not found, assuming not compatible\n"); goto finish; } else if (rc < 0) { hid_err(hdev, "failed retrieving pen parameters: %d\n", rc); goto cleanup; } else if (rc < params_len_min) { hid_dbg(hdev, "string descriptor with pen parameters is too short (got %d, expected at least %d), assuming not compatible\n", rc, params_len_min); goto finish; } params_len = rc; /* * Check it's not just a catch-all UTF-16LE-encoded ASCII * string (such as the model name) some tablets put into all * unknown string descriptors. */ for (i = 2; i < params_len && (buf[i] >= 0x20 && buf[i] < 0x7f && buf[i + 1] == 0); i += 2); if (i >= params_len) { hid_dbg(hdev, "string descriptor with pen parameters seems to contain only text, assuming not compatible\n"); goto finish; } /* * Fill report descriptor parameters from the string descriptor */ desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_LM] = uclogic_params_get_le24(buf + 2); desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_LM] = uclogic_params_get_le24(buf + 5); desc_params[UCLOGIC_RDESC_PEN_PH_ID_PRESSURE_LM] = get_unaligned_le16(buf + 8); resolution = get_unaligned_le16(buf + 10); if (resolution == 0) { desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_PM] = 0; desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_PM] = 0; } else { desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_PM] = desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_LM] * 1000 / resolution; desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_PM] = desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_LM] * 1000 / resolution; } /* * Generate pen report descriptor */ desc_ptr = uclogic_rdesc_template_apply( uclogic_rdesc_v2_pen_template_arr, uclogic_rdesc_v2_pen_template_size, desc_params, ARRAY_SIZE(desc_params)); if (desc_ptr == NULL) { rc = -ENOMEM; goto cleanup; } /* * Fill-in the parameters */ memset(pen, 0, sizeof(*pen)); pen->desc_ptr = desc_ptr; desc_ptr = NULL; pen->desc_size = uclogic_rdesc_v2_pen_template_size; pen->id = UCLOGIC_RDESC_V2_PEN_ID; pen->inrange = UCLOGIC_PARAMS_PEN_INRANGE_NONE; pen->fragmented_hires = true; pen->tilt_y_flipped = true; found = true; if (pparams_ptr != NULL) { *pparams_ptr = buf; buf = NULL; } if (pparams_len != NULL) *pparams_len = params_len; finish: *pfound = found; rc = 0; cleanup: kfree(desc_ptr); kfree(buf); return rc; } /** * uclogic_params_frame_cleanup - free resources used by struct * uclogic_params_frame (tablet interface's frame controls input parameters). * Can be called repeatedly. * * @frame: Frame controls input parameters to cleanup. Cannot be NULL. 
*/ static void uclogic_params_frame_cleanup(struct uclogic_params_frame *frame) { kfree(frame->desc_ptr); memset(frame, 0, sizeof(*frame)); } /** * uclogic_params_frame_init_with_desc() - initialize tablet's frame control * parameters with a static report descriptor. * * @frame: Pointer to the frame parameters to initialize (to be cleaned * up with uclogic_params_frame_cleanup()). Not modified in case * of error. Cannot be NULL. * @desc_ptr: Report descriptor pointer. Can be NULL, if desc_size is zero. * @desc_size: Report descriptor size. * @id: Report ID used for frame reports, if they should be tweaked, * zero if not. * * Returns: * Zero, if successful. A negative errno code on error. */ static int uclogic_params_frame_init_with_desc( struct uclogic_params_frame *frame, const __u8 *desc_ptr, size_t desc_size, unsigned int id) { __u8 *copy_desc_ptr; if (frame == NULL || (desc_ptr == NULL && desc_size != 0)) return -EINVAL; copy_desc_ptr = kmemdup(desc_ptr, desc_size, GFP_KERNEL); if (copy_desc_ptr == NULL) return -ENOMEM; memset(frame, 0, sizeof(*frame)); frame->desc_ptr = copy_desc_ptr; frame->desc_size = desc_size; frame->id = id; return 0; } /** * uclogic_params_frame_init_v1() - initialize v1 tablet interface frame * controls. * * @frame: Pointer to the frame parameters to initialize (to be cleaned * up with uclogic_params_frame_cleanup()). Not modified in case * of error, or if parameters are not found. Cannot be NULL. * @pfound: Location for a flag which is set to true if the parameters * were found, and to false if not (e.g. device was * incompatible). Not modified in case of error. Cannot be NULL. * @hdev: The HID device of the tablet interface to initialize and get * parameters from. Cannot be NULL. * * Returns: * Zero, if successful. A negative errno code on error. */ static int uclogic_params_frame_init_v1(struct uclogic_params_frame *frame, bool *pfound, struct hid_device *hdev) { int rc; bool found = false; struct usb_device *usb_dev; char *str_buf = NULL; const size_t str_len = 16; /* Check arguments */ if (frame == NULL || pfound == NULL || hdev == NULL) { rc = -EINVAL; goto cleanup; } usb_dev = hid_to_usb_dev(hdev); /* * Enable generic button mode */ str_buf = kzalloc(str_len, GFP_KERNEL); if (str_buf == NULL) { rc = -ENOMEM; goto cleanup; } rc = usb_string(usb_dev, 123, str_buf, str_len); if (rc == -EPIPE) { hid_dbg(hdev, "generic button -enabling string descriptor not found\n"); } else if (rc < 0) { goto cleanup; } else if (strncmp(str_buf, "HK On", rc) != 0) { hid_dbg(hdev, "invalid response to enabling generic buttons: \"%s\"\n", str_buf); } else { hid_dbg(hdev, "generic buttons enabled\n"); rc = uclogic_params_frame_init_with_desc( frame, uclogic_rdesc_v1_frame_arr, uclogic_rdesc_v1_frame_size, UCLOGIC_RDESC_V1_FRAME_ID); if (rc != 0) goto cleanup; found = true; } *pfound = found; rc = 0; cleanup: kfree(str_buf); return rc; } /** * uclogic_params_cleanup_event_hooks - free resources used by the list of raw * event hooks. * Can be called repeatedly. * * @params: Input parameters to cleanup. Cannot be NULL. 
*/ static void uclogic_params_cleanup_event_hooks(struct uclogic_params *params) { struct uclogic_raw_event_hook *curr, *n; if (!params || !params->event_hooks) return; list_for_each_entry_safe(curr, n, ¶ms->event_hooks->list, list) { cancel_work_sync(&curr->work); list_del(&curr->list); kfree(curr->event); kfree(curr); } kfree(params->event_hooks); params->event_hooks = NULL; } /** * uclogic_params_cleanup - free resources used by struct uclogic_params * (tablet interface's parameters). * Can be called repeatedly. * * @params: Input parameters to cleanup. Cannot be NULL. */ void uclogic_params_cleanup(struct uclogic_params *params) { if (!params->invalid) { size_t i; kfree(params->desc_ptr); uclogic_params_pen_cleanup(¶ms->pen); for (i = 0; i < ARRAY_SIZE(params->frame_list); i++) uclogic_params_frame_cleanup(¶ms->frame_list[i]); uclogic_params_cleanup_event_hooks(params); memset(params, 0, sizeof(*params)); } } /** * uclogic_params_get_desc() - Get a replacement report descriptor for a * tablet's interface. * * @params: The parameters of a tablet interface to get report * descriptor for. Cannot be NULL. * @pdesc: Location for the resulting, kmalloc-allocated report * descriptor pointer, or for NULL, if there's no replacement * report descriptor. Not modified in case of error. Cannot be * NULL. * @psize: Location for the resulting report descriptor size, not set if * there's no replacement report descriptor. Not modified in case * of error. Cannot be NULL. * * Returns: * Zero, if successful. * -EINVAL, if invalid arguments are supplied. * -ENOMEM, if failed to allocate memory. */ int uclogic_params_get_desc(const struct uclogic_params *params, const __u8 **pdesc, unsigned int *psize) { int rc = -ENOMEM; bool present = false; unsigned int size = 0; __u8 *desc = NULL; size_t i; /* Check arguments */ if (params == NULL || pdesc == NULL || psize == NULL) return -EINVAL; /* Concatenate descriptors */ #define ADD_DESC(_desc_ptr, _desc_size) \ do { \ unsigned int new_size; \ __u8 *new_desc; \ if ((_desc_ptr) == NULL) { \ break; \ } \ new_size = size + (_desc_size); \ new_desc = krealloc(desc, new_size, GFP_KERNEL); \ if (new_desc == NULL) { \ goto cleanup; \ } \ memcpy(new_desc + size, (_desc_ptr), (_desc_size)); \ desc = new_desc; \ size = new_size; \ present = true; \ } while (0) ADD_DESC(params->desc_ptr, params->desc_size); ADD_DESC(params->pen.desc_ptr, params->pen.desc_size); for (i = 0; i < ARRAY_SIZE(params->frame_list); i++) { ADD_DESC(params->frame_list[i].desc_ptr, params->frame_list[i].desc_size); } #undef ADD_DESC if (present) { *pdesc = desc; *psize = size; desc = NULL; } rc = 0; cleanup: kfree(desc); return rc; } /** * uclogic_params_init_invalid() - initialize tablet interface parameters, * specifying the interface is invalid. * * @params: Parameters to initialize (to be cleaned with * uclogic_params_cleanup()). Cannot be NULL. */ static void uclogic_params_init_invalid(struct uclogic_params *params) { params->invalid = true; } /** * uclogic_params_init_with_opt_desc() - initialize tablet interface * parameters with an optional replacement report descriptor. Only modify * report descriptor, if the original report descriptor matches the expected * size. * * @params: Parameters to initialize (to be cleaned with * uclogic_params_cleanup()). Not modified in case of * error. Cannot be NULL. * @hdev: The HID device of the tablet interface create the * parameters for. Cannot be NULL. * @orig_desc_size: Expected size of the original report descriptor to * be replaced. 
* @desc_ptr: Pointer to the replacement report descriptor. * Can be NULL, if desc_size is zero. * @desc_size: Size of the replacement report descriptor. * * Returns: * Zero, if successful. -EINVAL if an invalid argument was passed. * -ENOMEM, if failed to allocate memory. */ static int uclogic_params_init_with_opt_desc(struct uclogic_params *params, struct hid_device *hdev, unsigned int orig_desc_size, const __u8 *desc_ptr, unsigned int desc_size) { __u8 *desc_copy_ptr = NULL; unsigned int desc_copy_size; int rc; /* Check arguments */ if (params == NULL || hdev == NULL || (desc_ptr == NULL && desc_size != 0)) { rc = -EINVAL; goto cleanup; } /* Replace report descriptor, if it matches */ if (hdev->dev_rsize == orig_desc_size) { hid_dbg(hdev, "device report descriptor matches the expected size, replacing\n"); desc_copy_ptr = kmemdup(desc_ptr, desc_size, GFP_KERNEL); if (desc_copy_ptr == NULL) { rc = -ENOMEM; goto cleanup; } desc_copy_size = desc_size; } else { hid_dbg(hdev, "device report descriptor doesn't match the expected size (%u != %u), preserving\n", hdev->dev_rsize, orig_desc_size); desc_copy_ptr = NULL; desc_copy_size = 0; } /* Output parameters */ memset(params, 0, sizeof(*params)); params->desc_ptr = desc_copy_ptr; desc_copy_ptr = NULL; params->desc_size = desc_copy_size; rc = 0; cleanup: kfree(desc_copy_ptr); return rc; } /** * uclogic_params_huion_init() - initialize a Huion tablet interface and discover * its parameters. * * @params: Parameters to fill in (to be cleaned with * uclogic_params_cleanup()). Not modified in case of error. * Cannot be NULL. * @hdev: The HID device of the tablet interface to initialize and get * parameters from. Cannot be NULL. * * Returns: * Zero, if successful. A negative errno code on error. */ static int uclogic_params_huion_init(struct uclogic_params *params, struct hid_device *hdev) { int rc; struct usb_device *udev; struct usb_interface *iface; __u8 bInterfaceNumber; bool found; /* The resulting parameters (noop) */ struct uclogic_params p = {0, }; static const char transition_ver[] = "HUION_T153_160607"; char *ver_ptr = NULL; const size_t ver_len = sizeof(transition_ver) + 1; __u8 *params_ptr = NULL; size_t params_len = 0; /* Parameters string descriptor of a model with touch ring (HS610) */ static const __u8 touch_ring_model_params_buf[] = { 0x13, 0x03, 0x70, 0xC6, 0x00, 0x06, 0x7C, 0x00, 0xFF, 0x1F, 0xD8, 0x13, 0x03, 0x0D, 0x10, 0x01, 0x04, 0x3C, 0x3E }; /* Check arguments */ if (params == NULL || hdev == NULL) { rc = -EINVAL; goto cleanup; } udev = hid_to_usb_dev(hdev); iface = to_usb_interface(hdev->dev.parent); bInterfaceNumber = iface->cur_altsetting->desc.bInterfaceNumber; /* If it's a custom keyboard interface */ if (bInterfaceNumber == 1) { /* Keep everything intact, but mark pen usage invalid */ p.pen.usage_invalid = true; goto output; /* Else, if it's not a pen interface */ } else if (bInterfaceNumber != 0) { uclogic_params_init_invalid(&p); goto output; } /* Try to get firmware version */ ver_ptr = kzalloc(ver_len, GFP_KERNEL); if (ver_ptr == NULL) { rc = -ENOMEM; goto cleanup; } rc = usb_string(udev, 201, ver_ptr, ver_len); if (rc == -EPIPE) { *ver_ptr = '\0'; } else if (rc < 0) { hid_err(hdev, "failed retrieving Huion firmware version: %d\n", rc); goto cleanup; } /* The firmware is used in userspace as unique identifier */ strscpy(hdev->uniq, ver_ptr, sizeof(hdev->uniq)); /* If this is a transition firmware */ if (strcmp(ver_ptr, transition_ver) == 0) { hid_dbg(hdev, "transition firmware detected, not probing pen v2 
parameters\n"); } else { /* Try to probe v2 pen parameters */ rc = uclogic_params_pen_init_v2(&p.pen, &found, ¶ms_ptr, ¶ms_len, hdev); if (rc != 0) { hid_err(hdev, "failed probing pen v2 parameters: %d\n", rc); goto cleanup; } else if (found) { hid_dbg(hdev, "pen v2 parameters found\n"); /* Create v2 frame button parameters */ rc = uclogic_params_frame_init_with_desc( &p.frame_list[0], uclogic_rdesc_v2_frame_buttons_arr, uclogic_rdesc_v2_frame_buttons_size, UCLOGIC_RDESC_V2_FRAME_BUTTONS_ID); if (rc != 0) { hid_err(hdev, "failed creating v2 frame button parameters: %d\n", rc); goto cleanup; } /* Link from pen sub-report */ p.pen.subreport_list[0].value = 0xe0; p.pen.subreport_list[0].id = UCLOGIC_RDESC_V2_FRAME_BUTTONS_ID; /* If this is the model with touch ring */ if (params_ptr != NULL && params_len == sizeof(touch_ring_model_params_buf) && memcmp(params_ptr, touch_ring_model_params_buf, params_len) == 0) { /* Create touch ring parameters */ rc = uclogic_params_frame_init_with_desc( &p.frame_list[1], uclogic_rdesc_v2_frame_touch_ring_arr, uclogic_rdesc_v2_frame_touch_ring_size, UCLOGIC_RDESC_V2_FRAME_TOUCH_ID); if (rc != 0) { hid_err(hdev, "failed creating v2 frame touch ring parameters: %d\n", rc); goto cleanup; } p.frame_list[1].suffix = "Touch Ring"; p.frame_list[1].dev_id_byte = UCLOGIC_RDESC_V2_FRAME_TOUCH_DEV_ID_BYTE; p.frame_list[1].touch_byte = 5; p.frame_list[1].touch_max = 12; p.frame_list[1].touch_flip_at = 7; } else { /* Create touch strip parameters */ rc = uclogic_params_frame_init_with_desc( &p.frame_list[1], uclogic_rdesc_v2_frame_touch_strip_arr, uclogic_rdesc_v2_frame_touch_strip_size, UCLOGIC_RDESC_V2_FRAME_TOUCH_ID); if (rc != 0) { hid_err(hdev, "failed creating v2 frame touch strip parameters: %d\n", rc); goto cleanup; } p.frame_list[1].suffix = "Touch Strip"; p.frame_list[1].dev_id_byte = UCLOGIC_RDESC_V2_FRAME_TOUCH_DEV_ID_BYTE; p.frame_list[1].touch_byte = 5; p.frame_list[1].touch_max = 8; } /* Link from pen sub-report */ p.pen.subreport_list[1].value = 0xf0; p.pen.subreport_list[1].id = UCLOGIC_RDESC_V2_FRAME_TOUCH_ID; /* Create v2 frame dial parameters */ rc = uclogic_params_frame_init_with_desc( &p.frame_list[2], uclogic_rdesc_v2_frame_dial_arr, uclogic_rdesc_v2_frame_dial_size, UCLOGIC_RDESC_V2_FRAME_DIAL_ID); if (rc != 0) { hid_err(hdev, "failed creating v2 frame dial parameters: %d\n", rc); goto cleanup; } p.frame_list[2].suffix = "Dial"; p.frame_list[2].dev_id_byte = UCLOGIC_RDESC_V2_FRAME_DIAL_DEV_ID_BYTE; p.frame_list[2].bitmap_dial_byte = 5; /* Link from pen sub-report */ p.pen.subreport_list[2].value = 0xf1; p.pen.subreport_list[2].id = UCLOGIC_RDESC_V2_FRAME_DIAL_ID; goto output; } hid_dbg(hdev, "pen v2 parameters not found\n"); } /* Try to probe v1 pen parameters */ rc = uclogic_params_pen_init_v1(&p.pen, &found, hdev); if (rc != 0) { hid_err(hdev, "failed probing pen v1 parameters: %d\n", rc); goto cleanup; } else if (found) { hid_dbg(hdev, "pen v1 parameters found\n"); /* Try to probe v1 frame */ rc = uclogic_params_frame_init_v1(&p.frame_list[0], &found, hdev); if (rc != 0) { hid_err(hdev, "v1 frame probing failed: %d\n", rc); goto cleanup; } hid_dbg(hdev, "frame v1 parameters%s found\n", (found ? 
"" : " not")); if (found) { /* Link frame button subreports from pen reports */ p.pen.subreport_list[0].value = 0xe0; p.pen.subreport_list[0].id = UCLOGIC_RDESC_V1_FRAME_ID; } goto output; } hid_dbg(hdev, "pen v1 parameters not found\n"); uclogic_params_init_invalid(&p); output: /* Output parameters */ memcpy(params, &p, sizeof(*params)); memset(&p, 0, sizeof(p)); rc = 0; cleanup: kfree(params_ptr); kfree(ver_ptr); uclogic_params_cleanup(&p); return rc; } /** * uclogic_probe_interface() - some tablets, like the Parblo A610 PLUS V2 or * the XP-PEN Deco Mini 7, need to be initialized by sending them magic data. * * @hdev: The HID device of the tablet interface to initialize and get * parameters from. Cannot be NULL. * @magic_arr: The magic data that should be sent to probe the interface. * Cannot be NULL. * @magic_size: Size of the magic data. * @endpoint: Endpoint where the magic data should be sent. * * Returns: * Zero, if successful. A negative errno code on error. */ static int uclogic_probe_interface(struct hid_device *hdev, const u8 *magic_arr, size_t magic_size, int endpoint) { struct usb_device *udev; unsigned int pipe = 0; int sent; u8 *buf = NULL; int rc = 0; if (!hdev || !magic_arr) { rc = -EINVAL; goto cleanup; } buf = kmemdup(magic_arr, magic_size, GFP_KERNEL); if (!buf) { rc = -ENOMEM; goto cleanup; } udev = hid_to_usb_dev(hdev); pipe = usb_sndintpipe(udev, endpoint); rc = usb_interrupt_msg(udev, pipe, buf, magic_size, &sent, 1000); if (rc || sent != magic_size) { hid_err(hdev, "Interface probing failed: %d\n", rc); rc = -1; goto cleanup; } rc = 0; cleanup: kfree(buf); return rc; } /** * uclogic_params_parse_ugee_v2_desc - parse the string descriptor containing * pen and frame parameters returned by UGEE v2 devices. * * @str_desc: String descriptor, cannot be NULL. * @str_desc_size: Size of the string descriptor. * @desc_params: Output description params list. * @desc_params_size: Size of the output description params list. * @frame_type: Output frame type. * * Returns: * Zero, if successful. A negative errno code on error. */ static int uclogic_params_parse_ugee_v2_desc(const __u8 *str_desc, size_t str_desc_size, s32 *desc_params, size_t desc_params_size, enum uclogic_params_frame_type *frame_type) { s32 pen_x_lm, pen_y_lm; s32 pen_x_pm, pen_y_pm; s32 pen_pressure_lm; s32 frame_num_buttons; s32 resolution; /* Minimum descriptor length required, maximum seen so far is 14 */ const int min_str_desc_size = 12; if (!str_desc || str_desc_size < min_str_desc_size) return -EINVAL; if (desc_params_size != UCLOGIC_RDESC_PH_ID_NUM) return -EINVAL; pen_x_lm = get_unaligned_le16(str_desc + 2); pen_y_lm = get_unaligned_le16(str_desc + 4); frame_num_buttons = str_desc[6]; *frame_type = str_desc[7]; pen_pressure_lm = get_unaligned_le16(str_desc + 8); resolution = get_unaligned_le16(str_desc + 10); if (resolution == 0) { pen_x_pm = 0; pen_y_pm = 0; } else { pen_x_pm = pen_x_lm * 1000 / resolution; pen_y_pm = pen_y_lm * 1000 / resolution; } desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_LM] = pen_x_lm; desc_params[UCLOGIC_RDESC_PEN_PH_ID_X_PM] = pen_x_pm; desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_LM] = pen_y_lm; desc_params[UCLOGIC_RDESC_PEN_PH_ID_Y_PM] = pen_y_pm; desc_params[UCLOGIC_RDESC_PEN_PH_ID_PRESSURE_LM] = pen_pressure_lm; desc_params[UCLOGIC_RDESC_FRAME_PH_ID_UM] = frame_num_buttons; return 0; } /** * uclogic_params_ugee_v2_init_frame_buttons() - initialize a UGEE v2 frame with * buttons. * @p: Parameters to fill in, cannot be NULL. * @desc_params: Device description params list. 
* @desc_params_size: Size of the description params list. * * Returns: * Zero, if successful. A negative errno code on error. */ static int uclogic_params_ugee_v2_init_frame_buttons(struct uclogic_params *p, const s32 *desc_params, size_t desc_params_size) { __u8 *rdesc_frame = NULL; int rc = 0; if (!p || desc_params_size != UCLOGIC_RDESC_PH_ID_NUM) return -EINVAL; rdesc_frame = uclogic_rdesc_template_apply( uclogic_rdesc_ugee_v2_frame_btn_template_arr, uclogic_rdesc_ugee_v2_frame_btn_template_size, desc_params, UCLOGIC_RDESC_PH_ID_NUM); if (!rdesc_frame) return -ENOMEM; rc = uclogic_params_frame_init_with_desc(&p->frame_list[0], rdesc_frame, uclogic_rdesc_ugee_v2_frame_btn_template_size, UCLOGIC_RDESC_V1_FRAME_ID); kfree(rdesc_frame); return rc; } /** * uclogic_params_ugee_v2_init_frame_dial() - initialize a UGEE v2 frame with a * bitmap dial. * @p: Parameters to fill in, cannot be NULL. * @desc_params: Device description params list. * @desc_params_size: Size of the description params list. * * Returns: * Zero, if successful. A negative errno code on error. */ static int uclogic_params_ugee_v2_init_frame_dial(struct uclogic_params *p, const s32 *desc_params, size_t desc_params_size) { __u8 *rdesc_frame = NULL; int rc = 0; if (!p || desc_params_size != UCLOGIC_RDESC_PH_ID_NUM) return -EINVAL; rdesc_frame = uclogic_rdesc_template_apply( uclogic_rdesc_ugee_v2_frame_dial_template_arr, uclogic_rdesc_ugee_v2_frame_dial_template_size, desc_params, UCLOGIC_RDESC_PH_ID_NUM); if (!rdesc_frame) return -ENOMEM; rc = uclogic_params_frame_init_with_desc(&p->frame_list[0], rdesc_frame, uclogic_rdesc_ugee_v2_frame_dial_template_size, UCLOGIC_RDESC_V1_FRAME_ID); kfree(rdesc_frame); if (rc) return rc; p->frame_list[0].bitmap_dial_byte = 7; return 0; } /** * uclogic_params_ugee_v2_init_frame_mouse() - initialize a UGEE v2 frame with a * mouse. * @p: Parameters to fill in, cannot be NULL. * * Returns: * Zero, if successful. A negative errno code on error. */ static int uclogic_params_ugee_v2_init_frame_mouse(struct uclogic_params *p) { int rc = 0; if (!p) return -EINVAL; rc = uclogic_params_frame_init_with_desc(&p->frame_list[1], uclogic_rdesc_ugee_v2_frame_mouse_template_arr, uclogic_rdesc_ugee_v2_frame_mouse_template_size, UCLOGIC_RDESC_V1_FRAME_ID); return rc; } /** * uclogic_params_ugee_v2_has_battery() - check whether a UGEE v2 device has * battery or not. * @hdev: The HID device of the tablet interface. * * Returns: * True if the device has battery, false otherwise. */ static bool uclogic_params_ugee_v2_has_battery(struct hid_device *hdev) { struct uclogic_drvdata *drvdata = hid_get_drvdata(hdev); if (drvdata->quirks & UCLOGIC_BATTERY_QUIRK) return true; /* The XP-PEN Deco LW vendor, product and version are identical to the * Deco L. The only difference reported by their firmware is the product * name. Add a quirk to support battery reporting on the wireless * version. */ if (hdev->vendor == USB_VENDOR_ID_UGEE && hdev->product == USB_DEVICE_ID_UGEE_XPPEN_TABLET_DECO_L) { struct usb_device *udev = hid_to_usb_dev(hdev); if (strstarts(udev->product, "Deco LW")) return true; } return false; } /** * uclogic_params_ugee_v2_init_battery() - initialize UGEE v2 battery reporting. * @hdev: The HID device of the tablet interface, cannot be NULL. * @p: Parameters to fill in, cannot be NULL. * * Returns: * Zero, if successful. A negative errno code on error. 
*/ static int uclogic_params_ugee_v2_init_battery(struct hid_device *hdev, struct uclogic_params *p) { int rc = 0; if (!hdev || !p) return -EINVAL; /* Some tablets contain invalid characters in hdev->uniq, throwing a * "hwmon: '<name>' is not a valid name attribute, please fix" error. * Use the device vendor and product IDs instead. */ snprintf(hdev->uniq, sizeof(hdev->uniq), "%x-%x", hdev->vendor, hdev->product); rc = uclogic_params_frame_init_with_desc(&p->frame_list[1], uclogic_rdesc_ugee_v2_battery_template_arr, uclogic_rdesc_ugee_v2_battery_template_size, UCLOGIC_RDESC_UGEE_V2_BATTERY_ID); if (rc) return rc; p->frame_list[1].suffix = "Battery"; p->pen.subreport_list[1].value = 0xf2; p->pen.subreport_list[1].id = UCLOGIC_RDESC_UGEE_V2_BATTERY_ID; return rc; } /** * uclogic_params_ugee_v2_reconnect_work() - When a wireless tablet looses * connection to the USB dongle and reconnects, either because of its physical * distance or because it was switches off and on using the frame's switch, * uclogic_probe_interface() needs to be called again to enable the tablet. * * @work: The work that triggered this function. */ static void uclogic_params_ugee_v2_reconnect_work(struct work_struct *work) { struct uclogic_raw_event_hook *event_hook; event_hook = container_of(work, struct uclogic_raw_event_hook, work); uclogic_probe_interface(event_hook->hdev, uclogic_ugee_v2_probe_arr, uclogic_ugee_v2_probe_size, uclogic_ugee_v2_probe_endpoint); } /** * uclogic_params_ugee_v2_init_event_hooks() - initialize the list of events * to be hooked for UGEE v2 devices. * @hdev: The HID device of the tablet interface to initialize and get * parameters from. * @p: Parameters to fill in, cannot be NULL. * * Returns: * Zero, if successful. A negative errno code on error. */ static int uclogic_params_ugee_v2_init_event_hooks(struct hid_device *hdev, struct uclogic_params *p) { struct uclogic_raw_event_hook *event_hook; __u8 reconnect_event[] = { /* Event received on wireless tablet reconnection */ 0x02, 0xF8, 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; if (!p) return -EINVAL; /* The reconnection event is only received if the tablet has battery */ if (!uclogic_params_ugee_v2_has_battery(hdev)) return 0; p->event_hooks = kzalloc(sizeof(*p->event_hooks), GFP_KERNEL); if (!p->event_hooks) return -ENOMEM; INIT_LIST_HEAD(&p->event_hooks->list); event_hook = kzalloc(sizeof(*event_hook), GFP_KERNEL); if (!event_hook) return -ENOMEM; INIT_WORK(&event_hook->work, uclogic_params_ugee_v2_reconnect_work); event_hook->hdev = hdev; event_hook->size = ARRAY_SIZE(reconnect_event); event_hook->event = kmemdup(reconnect_event, event_hook->size, GFP_KERNEL); if (!event_hook->event) return -ENOMEM; list_add_tail(&event_hook->list, &p->event_hooks->list); return 0; } /** * uclogic_params_ugee_v2_init() - initialize a UGEE graphics tablets by * discovering their parameters. * * These tables, internally designed as v2 to differentiate them from older * models, expect a payload of magic data in orther to be switched to the fully * functional mode and expose their parameters in a similar way to the * information present in uclogic_params_pen_init_v1() but with some * differences. * * @params: Parameters to fill in (to be cleaned with * uclogic_params_cleanup()). Not modified in case of error. * Cannot be NULL. * @hdev: The HID device of the tablet interface to initialize and get * parameters from. Cannot be NULL. * * Returns: * Zero, if successful. A negative errno code on error. 
*/ static int uclogic_params_ugee_v2_init(struct uclogic_params *params, struct hid_device *hdev) { int rc = 0; struct uclogic_drvdata *drvdata; struct usb_interface *iface; __u8 bInterfaceNumber; const int str_desc_len = 12; __u8 *str_desc = NULL; __u8 *rdesc_pen = NULL; s32 desc_params[UCLOGIC_RDESC_PH_ID_NUM]; enum uclogic_params_frame_type frame_type; /* The resulting parameters (noop) */ struct uclogic_params p = {0, }; if (!params || !hdev) { rc = -EINVAL; goto cleanup; } drvdata = hid_get_drvdata(hdev); iface = to_usb_interface(hdev->dev.parent); bInterfaceNumber = iface->cur_altsetting->desc.bInterfaceNumber; if (bInterfaceNumber == 0) { rc = uclogic_params_ugee_v2_init_frame_mouse(&p); if (rc) goto cleanup; goto output; } if (bInterfaceNumber != 2) { uclogic_params_init_invalid(&p); goto output; } /* * Initialize the interface by sending magic data. * The specific data was discovered by sniffing the Windows driver * traffic. */ rc = uclogic_probe_interface(hdev, uclogic_ugee_v2_probe_arr, uclogic_ugee_v2_probe_size, uclogic_ugee_v2_probe_endpoint); if (rc) { uclogic_params_init_invalid(&p); goto output; } /* * Read the string descriptor containing pen and frame parameters. * The specific string descriptor and data were discovered by sniffing * the Windows driver traffic. */ rc = uclogic_params_get_str_desc(&str_desc, hdev, 100, str_desc_len); if (rc != str_desc_len) { hid_err(hdev, "failed retrieving pen and frame parameters: %d\n", rc); uclogic_params_init_invalid(&p); goto output; } rc = uclogic_params_parse_ugee_v2_desc(str_desc, str_desc_len, desc_params, ARRAY_SIZE(desc_params), &frame_type); if (rc) goto cleanup; kfree(str_desc); str_desc = NULL; /* Initialize the pen interface */ rdesc_pen = uclogic_rdesc_template_apply( uclogic_rdesc_ugee_v2_pen_template_arr, uclogic_rdesc_ugee_v2_pen_template_size, desc_params, ARRAY_SIZE(desc_params)); if (!rdesc_pen) { rc = -ENOMEM; goto cleanup; } p.pen.desc_ptr = rdesc_pen; p.pen.desc_size = uclogic_rdesc_ugee_v2_pen_template_size; p.pen.id = 0x02; p.pen.subreport_list[0].value = 0xf0; p.pen.subreport_list[0].id = UCLOGIC_RDESC_V1_FRAME_ID; /* Initialize the frame interface */ if (drvdata->quirks & UCLOGIC_MOUSE_FRAME_QUIRK) frame_type = UCLOGIC_PARAMS_FRAME_MOUSE; switch (frame_type) { case UCLOGIC_PARAMS_FRAME_DIAL: case UCLOGIC_PARAMS_FRAME_MOUSE: rc = uclogic_params_ugee_v2_init_frame_dial(&p, desc_params, ARRAY_SIZE(desc_params)); break; case UCLOGIC_PARAMS_FRAME_BUTTONS: default: rc = uclogic_params_ugee_v2_init_frame_buttons(&p, desc_params, ARRAY_SIZE(desc_params)); break; } if (rc) goto cleanup; /* Initialize the battery interface*/ if (uclogic_params_ugee_v2_has_battery(hdev)) { rc = uclogic_params_ugee_v2_init_battery(hdev, &p); if (rc) { hid_err(hdev, "error initializing battery: %d\n", rc); goto cleanup; } } /* Create a list of raw events to be ignored */ rc = uclogic_params_ugee_v2_init_event_hooks(hdev, &p); if (rc) { hid_err(hdev, "error initializing event hook list: %d\n", rc); goto cleanup; } output: /* Output parameters */ memcpy(params, &p, sizeof(*params)); memset(&p, 0, sizeof(p)); rc = 0; cleanup: kfree(str_desc); uclogic_params_cleanup(&p); return rc; } /** * uclogic_params_init() - initialize a tablet interface and discover its * parameters. * * @params: Parameters to fill in (to be cleaned with * uclogic_params_cleanup()). Not modified in case of error. * Cannot be NULL. * @hdev: The HID device of the tablet interface to initialize and get * parameters from. Cannot be NULL. 
Must be using the USB low-level * driver, i.e. be an actual USB tablet. * * Returns: * Zero, if successful. A negative errno code on error. */ int uclogic_params_init(struct uclogic_params *params, struct hid_device *hdev) { int rc; struct usb_device *udev; __u8 bNumInterfaces; struct usb_interface *iface; __u8 bInterfaceNumber; bool found; /* The resulting parameters (noop) */ struct uclogic_params p = {0, }; /* Check arguments */ if (params == NULL || hdev == NULL || !hid_is_usb(hdev)) { rc = -EINVAL; goto cleanup; } udev = hid_to_usb_dev(hdev); bNumInterfaces = udev->config->desc.bNumInterfaces; iface = to_usb_interface(hdev->dev.parent); bInterfaceNumber = iface->cur_altsetting->desc.bInterfaceNumber; /* * Set replacement report descriptor if the original matches the * specified size. Otherwise keep interface unchanged. */ #define WITH_OPT_DESC(_orig_desc_token, _new_desc_token) \ uclogic_params_init_with_opt_desc( \ &p, hdev, \ UCLOGIC_RDESC_##_orig_desc_token##_SIZE, \ uclogic_rdesc_##_new_desc_token##_arr, \ uclogic_rdesc_##_new_desc_token##_size) #define VID_PID(_vid, _pid) \ (((__u32)(_vid) << 16) | ((__u32)(_pid) & U16_MAX)) /* * Handle specific interfaces for specific tablets. * * Observe the following logic: * * If the interface is recognized as producing certain useful input: * Mark interface as valid. * Output interface parameters. * Else, if the interface is recognized as *not* producing any useful * input: * Mark interface as invalid. * Else: * Mark interface as valid. * Output noop parameters. * * Rule of thumb: it is better to disable a broken interface than let * it spew garbage input. */ switch (VID_PID(hdev->vendor, hdev->product)) { case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_PF1209): rc = WITH_OPT_DESC(PF1209_ORIG, pf1209_fixed); if (rc != 0) goto cleanup; break; case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_WP4030U): rc = WITH_OPT_DESC(WPXXXXU_ORIG, wp4030u_fixed); if (rc != 0) goto cleanup; break; case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_WP5540U): if (hdev->dev_rsize == UCLOGIC_RDESC_WP5540U_V2_ORIG_SIZE) { if (bInterfaceNumber == 0) { /* Try to probe v1 pen parameters */ rc = uclogic_params_pen_init_v1(&p.pen, &found, hdev); if (rc != 0) { hid_err(hdev, "pen probing failed: %d\n", rc); goto cleanup; } if (!found) { hid_warn(hdev, "pen parameters not found"); } } else { uclogic_params_init_invalid(&p); } } else { rc = WITH_OPT_DESC(WPXXXXU_ORIG, wp5540u_fixed); if (rc != 0) goto cleanup; } break; case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_WP8060U): rc = WITH_OPT_DESC(WPXXXXU_ORIG, wp8060u_fixed); if (rc != 0) goto cleanup; break; case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_WP1062): rc = WITH_OPT_DESC(WP1062_ORIG, wp1062_fixed); if (rc != 0) goto cleanup; break; case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_WIRELESS_TABLET_TWHL850): switch (bInterfaceNumber) { case 0: rc = WITH_OPT_DESC(TWHL850_ORIG0, twhl850_fixed0); if (rc != 0) goto cleanup; break; case 1: rc = WITH_OPT_DESC(TWHL850_ORIG1, twhl850_fixed1); if (rc != 0) goto cleanup; break; case 2: rc = WITH_OPT_DESC(TWHL850_ORIG2, twhl850_fixed2); if (rc != 0) goto cleanup; break; } break; case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_TABLET_TWHA60): /* * If it is not a three-interface version, which is known to * respond to initialization. 
*/ if (bNumInterfaces != 3) { switch (bInterfaceNumber) { case 0: rc = WITH_OPT_DESC(TWHA60_ORIG0, twha60_fixed0); if (rc != 0) goto cleanup; break; case 1: rc = WITH_OPT_DESC(TWHA60_ORIG1, twha60_fixed1); if (rc != 0) goto cleanup; break; } break; } fallthrough; case VID_PID(USB_VENDOR_ID_HUION, USB_DEVICE_ID_HUION_TABLET): case VID_PID(USB_VENDOR_ID_HUION, USB_DEVICE_ID_HUION_TABLET2): case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_HUION_TABLET): case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_YIYNOVA_TABLET): case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_UGEE_TABLET_81): case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_DRAWIMAGE_G3): case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_UGEE_TABLET_45): case VID_PID(USB_VENDOR_ID_UCLOGIC, USB_DEVICE_ID_UCLOGIC_UGEE_TABLET_47): rc = uclogic_params_huion_init(&p, hdev); if (rc != 0) goto cleanup; break; case VID_PID(USB_VENDOR_ID_UGTIZER, USB_DEVICE_ID_UGTIZER_TABLET_GP0610): case VID_PID(USB_VENDOR_ID_UGTIZER, USB_DEVICE_ID_UGTIZER_TABLET_GT5040): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_G540): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_G640): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_STAR06): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_TABLET_RAINBOW_CV720): /* If this is the pen interface */ if (bInterfaceNumber == 1) { /* Probe v1 pen parameters */ rc = uclogic_params_pen_init_v1(&p.pen, &found, hdev); if (rc != 0) { hid_err(hdev, "pen probing failed: %d\n", rc); goto cleanup; } if (!found) { hid_warn(hdev, "pen parameters not found"); uclogic_params_init_invalid(&p); } } else { uclogic_params_init_invalid(&p); } break; case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_DECO01): /* If this is the pen and frame interface */ if (bInterfaceNumber == 1) { /* Probe v1 pen parameters */ rc = uclogic_params_pen_init_v1(&p.pen, &found, hdev); if (rc != 0) { hid_err(hdev, "pen probing failed: %d\n", rc); goto cleanup; } /* Initialize frame parameters */ rc = uclogic_params_frame_init_with_desc( &p.frame_list[0], uclogic_rdesc_xppen_deco01_frame_arr, uclogic_rdesc_xppen_deco01_frame_size, 0); if (rc != 0) goto cleanup; } else { uclogic_params_init_invalid(&p); } break; case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_PARBLO_A610_PRO): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_DECO01_V2): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_DECO_L): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_DECO_PRO_MW): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_DECO_PRO_S): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_XPPEN_TABLET_DECO_PRO_SW): rc = uclogic_params_ugee_v2_init(&p, hdev); if (rc != 0) goto cleanup; break; case VID_PID(USB_VENDOR_ID_TRUST, USB_DEVICE_ID_TRUST_PANORA_TABLET): case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_TABLET_G5): /* Ignore non-pen interfaces */ if (bInterfaceNumber != 1) { uclogic_params_init_invalid(&p); break; } rc = uclogic_params_pen_init_v1(&p.pen, &found, hdev); if (rc != 0) { hid_err(hdev, "pen probing failed: %d\n", rc); goto cleanup; } else if (found) { rc = uclogic_params_frame_init_with_desc( &p.frame_list[0], uclogic_rdesc_ugee_g5_frame_arr, uclogic_rdesc_ugee_g5_frame_size, UCLOGIC_RDESC_UGEE_G5_FRAME_ID); if (rc != 0) { hid_err(hdev, "failed creating frame parameters: %d\n", rc); goto cleanup; } p.frame_list[0].re_lsb = UCLOGIC_RDESC_UGEE_G5_FRAME_RE_LSB; p.frame_list[0].dev_id_byte = 
UCLOGIC_RDESC_UGEE_G5_FRAME_DEV_ID_BYTE; } else { hid_warn(hdev, "pen parameters not found"); uclogic_params_init_invalid(&p); } break; case VID_PID(USB_VENDOR_ID_UGEE, USB_DEVICE_ID_UGEE_TABLET_EX07S): /* Ignore non-pen interfaces */ if (bInterfaceNumber != 1) { uclogic_params_init_invalid(&p); break; } rc = uclogic_params_pen_init_v1(&p.pen, &found, hdev); if (rc != 0) { hid_err(hdev, "pen probing failed: %d\n", rc); goto cleanup; } else if (found) { rc = uclogic_params_frame_init_with_desc( &p.frame_list[0], uclogic_rdesc_ugee_ex07_frame_arr, uclogic_rdesc_ugee_ex07_frame_size, 0); if (rc != 0) { hid_err(hdev, "failed creating frame parameters: %d\n", rc); goto cleanup; } } else { hid_warn(hdev, "pen parameters not found"); uclogic_params_init_invalid(&p); } break; } #undef VID_PID #undef WITH_OPT_DESC /* Output parameters */ memcpy(params, &p, sizeof(*params)); memset(&p, 0, sizeof(p)); rc = 0; cleanup: uclogic_params_cleanup(&p); return rc; } #ifdef CONFIG_HID_KUNIT_TEST #include "hid-uclogic-params-test.c" #endif |
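/*
 * Editor's illustrative sketch (not part of the driver above): one way a
 * hid_driver ->probe() callback could consume the parameter-discovery API
 * defined in this file. It assumes the declarations from
 * "hid-uclogic-params.h" and <linux/hid.h>. The name
 * "example_uclogic_probe" is hypothetical, and keeping the parameters in a
 * stack variable is a simplification: the real driver keeps them in its
 * driver data so ->report_fixup() and ->raw_event() can reuse them later.
 */
static int example_uclogic_probe(struct hid_device *hdev,
                                 const struct hid_device_id *id)
{
        struct uclogic_params params = { 0, };
        int rc;

        /* Probe the tablet interface and discover pen/frame parameters */
        rc = uclogic_params_init(&params, hdev);
        if (rc != 0)
                return rc;

        uclogic_params_hid_dbg(hdev, &params);

        /* Interfaces recognized as producing no useful input are dropped */
        if (params.invalid) {
                uclogic_params_cleanup(&params);
                return -ENODEV;
        }

        rc = hid_parse(hdev);
        if (rc == 0)
                rc = hid_hw_start(hdev, HID_CONNECT_DEFAULT);
        if (rc != 0)
                uclogic_params_cleanup(&params);
        /* On success a real driver would keep "params" (and later free it
         * with uclogic_params_cleanup()) instead of letting this copy go
         * out of scope.
         */
        return rc;
}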
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Pseudo-driver for the loopback interface.
 *
 * Version:	@(#)loopback.c	1.0.4b	08/16/93
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@scyld.com>
 *
 *		Alan Cox	:	Fixed oddments for NET3.014
 *		Alan Cox	:	Rejig for NET3.029 snap #3
 *		Alan Cox	:	Fixed NET3.029 bugs and sped up
 *		Larry McVoy	:	Tiny tweak to double performance
 *		Alan Cox	:	Backed out LMV's tweak - the linux mm
 *					can't take it...
 *		Michael Griffith:	Don't bother computing the checksums
 *					on packets received on the loopback
 *					interface.
 *		Alexey Kuznetsov:	Potential hang under some extreme
 *					cases removed.
 */
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/in.h>

#include <linux/uaccess.h>
#include <linux/io.h>

#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/ethtool.h>
#include <net/sch_generic.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <linux/if_ether.h>	/* For the statistics structure. */
#include <linux/if_arp.h>	/* For ARPHRD_ETHER */
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/percpu.h>
#include <linux/net_tstamp.h>
#include <net/net_namespace.h>
#include <linux/u64_stats_sync.h>

/* blackhole_netdev - a device used for dsts that are marked expired!
 * This is global device (instead of per-net-ns) since it's not needed
 * to be per-ns and gets initialized at boot time.
 */
struct net_device *blackhole_netdev;
EXPORT_SYMBOL(blackhole_netdev);

/* The higher levels take care of making this non-reentrant (it's
 * called with bh's disabled).
 */
static netdev_tx_t loopback_xmit(struct sk_buff *skb, struct net_device *dev)
{
	int len;

	skb_tx_timestamp(skb);

	/* do not fool net_timestamp_check() with various clock bases */
	skb_clear_tstamp(skb);

	skb_orphan(skb);

	/* Before queueing this packet to __netif_rx(),
	 * make sure dst is refcounted.
	 */
	skb_dst_force(skb);

	skb->protocol = eth_type_trans(skb, dev);

	len = skb->len;
	if (likely(__netif_rx(skb) == NET_RX_SUCCESS))
		dev_lstats_add(dev, len);

	return NETDEV_TX_OK;
}

void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes)
{
	int i;

	*packets = 0;
	*bytes = 0;

	for_each_possible_cpu(i) {
		const struct pcpu_lstats *lb_stats;
		u64 tbytes, tpackets;
		unsigned int start;

		lb_stats = per_cpu_ptr(dev->lstats, i);
		do {
			start = u64_stats_fetch_begin(&lb_stats->syncp);
			tpackets = u64_stats_read(&lb_stats->packets);
			tbytes = u64_stats_read(&lb_stats->bytes);
		} while (u64_stats_fetch_retry(&lb_stats->syncp, start));
		*bytes += tbytes;
		*packets += tpackets;
	}
}
EXPORT_SYMBOL(dev_lstats_read);

static void loopback_get_stats64(struct net_device *dev,
				 struct rtnl_link_stats64 *stats)
{
	u64 packets, bytes;

	dev_lstats_read(dev, &packets, &bytes);

	stats->rx_packets = packets;
	stats->tx_packets = packets;
	stats->rx_bytes = bytes;
	stats->tx_bytes = bytes;
}

static u32 always_on(struct net_device *dev)
{
	return 1;
}

static const struct ethtool_ops loopback_ethtool_ops = {
	.get_link	= always_on,
	.get_ts_info	= ethtool_op_get_ts_info,
};

static int loopback_dev_init(struct net_device *dev)
{
	netdev_lockdep_set_classes(dev);
	return 0;
}

static void loopback_dev_free(struct net_device *dev)
{
	dev_net(dev)->loopback_dev = NULL;
}

static const struct net_device_ops loopback_ops = {
	.ndo_init		= loopback_dev_init,
	.ndo_start_xmit		= loopback_xmit,
	.ndo_get_stats64	= loopback_get_stats64,
	.ndo_set_mac_address	= eth_mac_addr,
};

static void gen_lo_setup(struct net_device *dev,
			 unsigned int mtu,
			 const struct ethtool_ops *eth_ops,
			 const struct header_ops *hdr_ops,
			 const struct net_device_ops *dev_ops,
			 void (*dev_destructor)(struct net_device *dev))
{
	dev->mtu		= mtu;
	dev->hard_header_len	= ETH_HLEN;	/* 14	*/
	dev->min_header_len	= ETH_HLEN;	/* 14	*/
	dev->addr_len		= ETH_ALEN;	/* 6	*/
	dev->type		= ARPHRD_LOOPBACK;	/* 0x0001*/
	dev->flags		= IFF_LOOPBACK;
	dev->priv_flags		|= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
	dev->lltx		= true;
	dev->netns_local	= true;
	netif_keep_dst(dev);
	dev->hw_features	= NETIF_F_GSO_SOFTWARE;
	dev->features		= NETIF_F_SG | NETIF_F_FRAGLIST
		| NETIF_F_GSO_SOFTWARE
		| NETIF_F_HW_CSUM
		| NETIF_F_RXCSUM
		| NETIF_F_SCTP_CRC
		| NETIF_F_HIGHDMA
		| NETIF_F_VLAN_CHALLENGED
		| NETIF_F_LOOPBACK;
	dev->ethtool_ops	= eth_ops;
	dev->header_ops		= hdr_ops;
	dev->netdev_ops		= dev_ops;
	dev->needs_free_netdev	= true;
	dev->pcpu_stat_type	= NETDEV_PCPU_STAT_LSTATS;
	dev->priv_destructor	= dev_destructor;

	netif_set_tso_max_size(dev, GSO_MAX_SIZE);
}

/* The loopback device is special. There is only one instance
 * per network namespace.
 */
static void loopback_setup(struct net_device *dev)
{
	gen_lo_setup(dev, (64 * 1024), &loopback_ethtool_ops, &eth_header_ops,
		     &loopback_ops, loopback_dev_free);
}

/* Setup and register the loopback device. */
static __net_init int loopback_net_init(struct net *net)
{
	struct net_device *dev;
	int err;

	err = -ENOMEM;
	dev = alloc_netdev(0, "lo", NET_NAME_PREDICTABLE, loopback_setup);
	if (!dev)
		goto out;

	dev_net_set(dev, net);
	err = register_netdev(dev);
	if (err)
		goto out_free_netdev;

	BUG_ON(dev->ifindex != LOOPBACK_IFINDEX);
	net->loopback_dev = dev;
	return 0;

out_free_netdev:
	free_netdev(dev);
out:
	if (net_eq(net, &init_net))
		panic("loopback: Failed to register netdevice: %d\n", err);
	return err;
}

/* Registered in net/core/dev.c */
struct pernet_operations __net_initdata loopback_net_ops = {
	.init = loopback_net_init,
};

/* blackhole netdevice */
static netdev_tx_t blackhole_netdev_xmit(struct sk_buff *skb,
					 struct net_device *dev)
{
	kfree_skb(skb);
	net_warn_ratelimited("%s(): Dropping skb.\n", __func__);
	return NETDEV_TX_OK;
}

static const struct net_device_ops blackhole_netdev_ops = {
	.ndo_start_xmit = blackhole_netdev_xmit,
};

/* This is a dst-dummy device used specifically for invalidated
 * DSTs and unlike loopback, this is not per-ns.
 */
static void blackhole_netdev_setup(struct net_device *dev)
{
	gen_lo_setup(dev, ETH_MIN_MTU, NULL, NULL, &blackhole_netdev_ops, NULL);
}

/* Setup and register the blackhole_netdev. */
static int __init blackhole_netdev_init(void)
{
	blackhole_netdev = alloc_netdev(0, "blackhole_dev", NET_NAME_UNKNOWN,
					blackhole_netdev_setup);
	if (!blackhole_netdev)
		return -ENOMEM;

	rtnl_net_lock(&init_net);
	dev_init_scheduler(blackhole_netdev);
	dev_activate(blackhole_netdev);
	rtnl_net_unlock(&init_net);

	blackhole_netdev->flags |= IFF_UP | IFF_RUNNING;
	return 0;
}

device_initcall(blackhole_netdev_init);
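For context on the stats path above: loopback_xmit() accounts each packet with dev_lstats_add(), and dev_lstats_read() then folds every CPU's counters together under the u64_stats seqcount so 64-bit reads stay consistent on 32-bit systems. The writer side looks roughly like the sketch below (hypothetical helper name; the real helper is dev_lstats_add() in include/linux/netdevice.h):

static inline void lstats_add_sketch(struct net_device *dev, unsigned int len)
{
	struct pcpu_lstats *lstats = this_cpu_ptr(dev->lstats);

	u64_stats_update_begin(&lstats->syncp);	/* open writer section */
	u64_stats_add(&lstats->bytes, len);	/* bytes of this packet */
	u64_stats_inc(&lstats->packets);
	u64_stats_update_end(&lstats->syncp);	/* readers that raced will retry */
}

The update_begin()/update_end() pair is what the u64_stats_fetch_begin()/u64_stats_fetch_retry() loop in dev_lstats_read() synchronizes against.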
// SPDX-License-Identifier: GPL-2.0-or-later /* * HID driver for the Prodikeys PC-MIDI Keyboard * providing midi & extra multimedia keys functionality * * Copyright (c)
2009 Don Prince <dhprince.devel@yahoo.co.uk> * * Controls for Octave Shift Up/Down, Channel, and * Sustain Duration available via sysfs. */ /* */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/device.h> #include <linux/module.h> #include <linux/usb.h> #include <linux/mutex.h> #include <linux/hid.h> #include <sound/core.h> #include <sound/initval.h> #include <sound/rawmidi.h> #include "hid-ids.h" #define pk_debug(format, arg...) \ pr_debug("hid-prodikeys: " format "\n" , ## arg) #define pk_error(format, arg...) \ pr_err("hid-prodikeys: " format "\n" , ## arg) struct pcmidi_snd; struct pcmidi_sustain { unsigned long in_use; struct pcmidi_snd *pm; struct timer_list timer; unsigned char status; unsigned char note; unsigned char velocity; }; #define PCMIDI_SUSTAINED_MAX 32 struct pcmidi_snd { struct hid_device *hdev; unsigned short ifnum; struct hid_report *pcmidi_report6; struct input_dev *input_ep82; unsigned short midi_mode; unsigned short midi_sustain_mode; unsigned short midi_sustain; unsigned short midi_channel; short midi_octave; struct pcmidi_sustain sustained_notes[PCMIDI_SUSTAINED_MAX]; unsigned short fn_state; unsigned short last_key[24]; spinlock_t rawmidi_in_lock; struct snd_card *card; struct snd_rawmidi *rwmidi; struct snd_rawmidi_substream *in_substream; unsigned long in_triggered; }; #define PK_QUIRK_NOGET 0x00010000 #define PCMIDI_MIDDLE_C 60 #define PCMIDI_CHANNEL_MIN 0 #define PCMIDI_CHANNEL_MAX 15 #define PCMIDI_OCTAVE_MIN (-2) #define PCMIDI_OCTAVE_MAX 2 #define PCMIDI_SUSTAIN_MIN 0 #define PCMIDI_SUSTAIN_MAX 5000 static const char shortname[] = "PC-MIDI"; static const char longname[] = "Prodikeys PC-MIDI Keyboard"; static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; static bool enable[SNDRV_CARDS] = SNDRV_DEFAULT_ENABLE_PNP; module_param_array(index, int, NULL, 0444); module_param_array(id, charp, NULL, 0444); module_param_array(enable, bool, NULL, 0444); MODULE_PARM_DESC(index, "Index value for the PC-MIDI virtual audio driver"); MODULE_PARM_DESC(id, "ID string for the PC-MIDI virtual audio driver"); MODULE_PARM_DESC(enable, "Enable for the PC-MIDI virtual audio driver"); /* Output routine for the sysfs channel file */ static ssize_t show_channel(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hdev = to_hid_device(dev); struct pcmidi_snd *pm = hid_get_drvdata(hdev); dbg_hid("pcmidi sysfs read channel=%u\n", pm->midi_channel); return sprintf(buf, "%u (min:%u, max:%u)\n", pm->midi_channel, PCMIDI_CHANNEL_MIN, PCMIDI_CHANNEL_MAX); } /* Input routine for the sysfs channel file */ static ssize_t store_channel(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct hid_device *hdev = to_hid_device(dev); struct pcmidi_snd *pm = hid_get_drvdata(hdev); unsigned channel = 0; if (sscanf(buf, "%u", &channel) > 0 && channel <= PCMIDI_CHANNEL_MAX) { dbg_hid("pcmidi sysfs write channel=%u\n", channel); pm->midi_channel = channel; return strlen(buf); } return -EINVAL; } static DEVICE_ATTR(channel, S_IRUGO | S_IWUSR | S_IWGRP , show_channel, store_channel); static struct device_attribute *sysfs_device_attr_channel = { &dev_attr_channel, }; /* Output routine for the sysfs sustain file */ static ssize_t show_sustain(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hdev = to_hid_device(dev); struct pcmidi_snd *pm = hid_get_drvdata(hdev); dbg_hid("pcmidi sysfs read sustain=%u\n", pm->midi_sustain); return sprintf(buf, "%u 
(off:%u, max:%u (ms))\n", pm->midi_sustain, PCMIDI_SUSTAIN_MIN, PCMIDI_SUSTAIN_MAX); } /* Input routine for the sysfs sustain file */ static ssize_t store_sustain(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct hid_device *hdev = to_hid_device(dev); struct pcmidi_snd *pm = hid_get_drvdata(hdev); unsigned sustain = 0; if (sscanf(buf, "%u", &sustain) > 0 && sustain <= PCMIDI_SUSTAIN_MAX) { dbg_hid("pcmidi sysfs write sustain=%u\n", sustain); pm->midi_sustain = sustain; pm->midi_sustain_mode = (0 == sustain || !pm->midi_mode) ? 0 : 1; return strlen(buf); } return -EINVAL; } static DEVICE_ATTR(sustain, S_IRUGO | S_IWUSR | S_IWGRP, show_sustain, store_sustain); static struct device_attribute *sysfs_device_attr_sustain = { &dev_attr_sustain, }; /* Output routine for the sysfs octave file */ static ssize_t show_octave(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hdev = to_hid_device(dev); struct pcmidi_snd *pm = hid_get_drvdata(hdev); dbg_hid("pcmidi sysfs read octave=%d\n", pm->midi_octave); return sprintf(buf, "%d (min:%d, max:%d)\n", pm->midi_octave, PCMIDI_OCTAVE_MIN, PCMIDI_OCTAVE_MAX); } /* Input routine for the sysfs octave file */ static ssize_t store_octave(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct hid_device *hdev = to_hid_device(dev); struct pcmidi_snd *pm = hid_get_drvdata(hdev); int octave = 0; if (sscanf(buf, "%d", &octave) > 0 && octave >= PCMIDI_OCTAVE_MIN && octave <= PCMIDI_OCTAVE_MAX) { dbg_hid("pcmidi sysfs write octave=%d\n", octave); pm->midi_octave = octave; return strlen(buf); } return -EINVAL; } static DEVICE_ATTR(octave, S_IRUGO | S_IWUSR | S_IWGRP, show_octave, store_octave); static struct device_attribute *sysfs_device_attr_octave = { &dev_attr_octave, }; static void pcmidi_send_note(struct pcmidi_snd *pm, unsigned char status, unsigned char note, unsigned char velocity) { unsigned long flags; unsigned char buffer[3]; buffer[0] = status; buffer[1] = note; buffer[2] = velocity; spin_lock_irqsave(&pm->rawmidi_in_lock, flags); if (!pm->in_substream) goto drop_note; if (!test_bit(pm->in_substream->number, &pm->in_triggered)) goto drop_note; snd_rawmidi_receive(pm->in_substream, buffer, 3); drop_note: spin_unlock_irqrestore(&pm->rawmidi_in_lock, flags); return; } static void pcmidi_sustained_note_release(struct timer_list *t) { struct pcmidi_sustain *pms = from_timer(pms, t, timer); pcmidi_send_note(pms->pm, pms->status, pms->note, pms->velocity); pms->in_use = 0; } static void init_sustain_timers(struct pcmidi_snd *pm) { struct pcmidi_sustain *pms; unsigned i; for (i = 0; i < PCMIDI_SUSTAINED_MAX; i++) { pms = &pm->sustained_notes[i]; pms->in_use = 0; pms->pm = pm; timer_setup(&pms->timer, pcmidi_sustained_note_release, 0); } } static void stop_sustain_timers(struct pcmidi_snd *pm) { struct pcmidi_sustain *pms; unsigned i; for (i = 0; i < PCMIDI_SUSTAINED_MAX; i++) { pms = &pm->sustained_notes[i]; pms->in_use = 1; del_timer_sync(&pms->timer); } } static int pcmidi_get_output_report(struct pcmidi_snd *pm) { struct hid_device *hdev = pm->hdev; struct hid_report *report; list_for_each_entry(report, &hdev->report_enum[HID_OUTPUT_REPORT].report_list, list) { if (!(6 == report->id)) continue; if (report->maxfield < 1) { hid_err(hdev, "output report is empty\n"); break; } if (report->field[0]->report_count != 2) { hid_err(hdev, "field count too low\n"); break; } pm->pcmidi_report6 = report; return 0; } /* should never get here */ return -ENODEV; } 
static void pcmidi_submit_output_report(struct pcmidi_snd *pm, int state) { struct hid_device *hdev = pm->hdev; struct hid_report *report = pm->pcmidi_report6; report->field[0]->value[0] = 0x01; report->field[0]->value[1] = state; hid_hw_request(hdev, report, HID_REQ_SET_REPORT); } static int pcmidi_handle_report1(struct pcmidi_snd *pm, u8 *data) { u32 bit_mask; bit_mask = data[1]; bit_mask = (bit_mask << 8) | data[2]; bit_mask = (bit_mask << 8) | data[3]; dbg_hid("pcmidi mode: %d\n", pm->midi_mode); /*KEY_MAIL or octave down*/ if (pm->midi_mode && bit_mask == 0x004000) { /* octave down */ pm->midi_octave--; if (pm->midi_octave < -2) pm->midi_octave = -2; dbg_hid("pcmidi mode: %d octave: %d\n", pm->midi_mode, pm->midi_octave); return 1; } /*KEY_WWW or sustain*/ else if (pm->midi_mode && bit_mask == 0x000004) { /* sustain on/off*/ pm->midi_sustain_mode ^= 0x1; return 1; } return 0; /* continue key processing */ } static int pcmidi_handle_report3(struct pcmidi_snd *pm, u8 *data, int size) { struct pcmidi_sustain *pms; unsigned i, j; unsigned char status, note, velocity; unsigned num_notes = (size-1)/2; for (j = 0; j < num_notes; j++) { note = data[j*2+1]; velocity = data[j*2+2]; if (note < 0x81) { /* note on */ status = 128 + 16 + pm->midi_channel; /* 1001nnnn */ note = note - 0x54 + PCMIDI_MIDDLE_C + (pm->midi_octave * 12); if (0 == velocity) velocity = 1; /* force note on */ } else { /* note off */ status = 128 + pm->midi_channel; /* 1000nnnn */ note = note - 0x94 + PCMIDI_MIDDLE_C + (pm->midi_octave*12); if (pm->midi_sustain_mode) { for (i = 0; i < PCMIDI_SUSTAINED_MAX; i++) { pms = &pm->sustained_notes[i]; if (!pms->in_use) { pms->status = status; pms->note = note; pms->velocity = velocity; pms->in_use = 1; mod_timer(&pms->timer, jiffies + msecs_to_jiffies(pm->midi_sustain)); return 1; } } } } pcmidi_send_note(pm, status, note, velocity); } return 1; } static int pcmidi_handle_report4(struct pcmidi_snd *pm, u8 *data) { unsigned key; u32 bit_mask; u32 bit_index; bit_mask = data[1]; bit_mask = (bit_mask << 8) | data[2]; bit_mask = (bit_mask << 8) | data[3]; /* break keys */ for (bit_index = 0; bit_index < 24; bit_index++) { if (!((0x01 << bit_index) & bit_mask)) { input_event(pm->input_ep82, EV_KEY, pm->last_key[bit_index], 0); pm->last_key[bit_index] = 0; } } /* make keys */ for (bit_index = 0; bit_index < 24; bit_index++) { key = 0; switch ((0x01 << bit_index) & bit_mask) { case 0x000010: /* Fn lock*/ pm->fn_state ^= 0x000010; if (pm->fn_state) pcmidi_submit_output_report(pm, 0xc5); else pcmidi_submit_output_report(pm, 0xc6); continue; case 0x020000: /* midi launcher..send a key (qwerty) or not? 
*/ pcmidi_submit_output_report(pm, 0xc1); pm->midi_mode ^= 0x01; dbg_hid("pcmidi mode: %d\n", pm->midi_mode); continue; case 0x100000: /* KEY_MESSENGER or octave up */ dbg_hid("pcmidi mode: %d\n", pm->midi_mode); if (pm->midi_mode) { pm->midi_octave++; if (pm->midi_octave > 2) pm->midi_octave = 2; dbg_hid("pcmidi mode: %d octave: %d\n", pm->midi_mode, pm->midi_octave); continue; } else key = KEY_MESSENGER; break; case 0x400000: key = KEY_CALENDAR; break; case 0x080000: key = KEY_ADDRESSBOOK; break; case 0x040000: key = KEY_DOCUMENTS; break; case 0x800000: key = KEY_WORDPROCESSOR; break; case 0x200000: key = KEY_SPREADSHEET; break; case 0x010000: key = KEY_COFFEE; break; case 0x000100: key = KEY_HELP; break; case 0x000200: key = KEY_SEND; break; case 0x000400: key = KEY_REPLY; break; case 0x000800: key = KEY_FORWARDMAIL; break; case 0x001000: key = KEY_NEW; break; case 0x002000: key = KEY_OPEN; break; case 0x004000: key = KEY_CLOSE; break; case 0x008000: key = KEY_SAVE; break; case 0x000001: key = KEY_UNDO; break; case 0x000002: key = KEY_REDO; break; case 0x000004: key = KEY_SPELLCHECK; break; case 0x000008: key = KEY_PRINT; break; } if (key) { input_event(pm->input_ep82, EV_KEY, key, 1); pm->last_key[bit_index] = key; } } return 1; } static int pcmidi_handle_report( struct pcmidi_snd *pm, unsigned report_id, u8 *data, int size) { int ret = 0; switch (report_id) { case 0x01: /* midi keys (qwerty)*/ ret = pcmidi_handle_report1(pm, data); break; case 0x03: /* midi keyboard (musical)*/ ret = pcmidi_handle_report3(pm, data, size); break; case 0x04: /* multimedia/midi keys (qwerty)*/ ret = pcmidi_handle_report4(pm, data); break; } return ret; } static void pcmidi_setup_extra_keys( struct pcmidi_snd *pm, struct input_dev *input) { /* reassigned functionality for N/A keys MY PICTURES => KEY_WORDPROCESSOR MY MUSIC=> KEY_SPREADSHEET */ static const unsigned int keys[] = { KEY_FN, KEY_MESSENGER, KEY_CALENDAR, KEY_ADDRESSBOOK, KEY_DOCUMENTS, KEY_WORDPROCESSOR, KEY_SPREADSHEET, KEY_COFFEE, KEY_HELP, KEY_SEND, KEY_REPLY, KEY_FORWARDMAIL, KEY_NEW, KEY_OPEN, KEY_CLOSE, KEY_SAVE, KEY_UNDO, KEY_REDO, KEY_SPELLCHECK, KEY_PRINT, 0 }; const unsigned int *pkeys = &keys[0]; unsigned short i; if (pm->ifnum != 1) /* only set up ONCE for interace 1 */ return; pm->input_ep82 = input; for (i = 0; i < 24; i++) pm->last_key[i] = 0; while (*pkeys != 0) { set_bit(*pkeys, pm->input_ep82->keybit); ++pkeys; } } static int pcmidi_set_operational(struct pcmidi_snd *pm) { int rc; if (pm->ifnum != 1) return 0; /* only set up ONCE for interace 1 */ rc = pcmidi_get_output_report(pm); if (rc < 0) return rc; pcmidi_submit_output_report(pm, 0xc1); return 0; } static int pcmidi_snd_free(struct snd_device *dev) { return 0; } static int pcmidi_in_open(struct snd_rawmidi_substream *substream) { struct pcmidi_snd *pm = substream->rmidi->private_data; dbg_hid("pcmidi in open\n"); pm->in_substream = substream; return 0; } static int pcmidi_in_close(struct snd_rawmidi_substream *substream) { dbg_hid("pcmidi in close\n"); return 0; } static void pcmidi_in_trigger(struct snd_rawmidi_substream *substream, int up) { struct pcmidi_snd *pm = substream->rmidi->private_data; dbg_hid("pcmidi in trigger %d\n", up); pm->in_triggered = up; } static const struct snd_rawmidi_ops pcmidi_in_ops = { .open = pcmidi_in_open, .close = pcmidi_in_close, .trigger = pcmidi_in_trigger }; static int pcmidi_snd_initialise(struct pcmidi_snd *pm) { static int dev; struct snd_card *card; struct snd_rawmidi *rwmidi; int err; static struct snd_device_ops ops = { 
.dev_free = pcmidi_snd_free, }; if (pm->ifnum != 1) return 0; /* only set up midi device ONCE for interace 1 */ if (dev >= SNDRV_CARDS) return -ENODEV; if (!enable[dev]) { dev++; return -ENOENT; } /* Setup sound card */ err = snd_card_new(&pm->hdev->dev, index[dev], id[dev], THIS_MODULE, 0, &card); if (err < 0) { pk_error("failed to create pc-midi sound card\n"); err = -ENOMEM; goto fail; } pm->card = card; /* Setup sound device */ err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, pm, &ops); if (err < 0) { pk_error("failed to create pc-midi sound device: error %d\n", err); goto fail; } strscpy(card->driver, shortname, sizeof(card->driver)); strscpy(card->shortname, shortname, sizeof(card->shortname)); strscpy(card->longname, longname, sizeof(card->longname)); /* Set up rawmidi */ err = snd_rawmidi_new(card, card->shortname, 0, 0, 1, &rwmidi); if (err < 0) { pk_error("failed to create pc-midi rawmidi device: error %d\n", err); goto fail; } pm->rwmidi = rwmidi; strscpy(rwmidi->name, card->shortname, sizeof(rwmidi->name)); rwmidi->info_flags = SNDRV_RAWMIDI_INFO_INPUT; rwmidi->private_data = pm; snd_rawmidi_set_ops(rwmidi, SNDRV_RAWMIDI_STREAM_INPUT, &pcmidi_in_ops); /* create sysfs variables */ err = device_create_file(&pm->hdev->dev, sysfs_device_attr_channel); if (err < 0) { pk_error("failed to create sysfs attribute channel: error %d\n", err); goto fail; } err = device_create_file(&pm->hdev->dev, sysfs_device_attr_sustain); if (err < 0) { pk_error("failed to create sysfs attribute sustain: error %d\n", err); goto fail_attr_sustain; } err = device_create_file(&pm->hdev->dev, sysfs_device_attr_octave); if (err < 0) { pk_error("failed to create sysfs attribute octave: error %d\n", err); goto fail_attr_octave; } spin_lock_init(&pm->rawmidi_in_lock); init_sustain_timers(pm); err = pcmidi_set_operational(pm); if (err < 0) { pk_error("failed to find output report\n"); goto fail_register; } /* register it */ err = snd_card_register(card); if (err < 0) { pk_error("failed to register pc-midi sound card: error %d\n", err); goto fail_register; } dbg_hid("pcmidi_snd_initialise finished ok\n"); return 0; fail_register: stop_sustain_timers(pm); device_remove_file(&pm->hdev->dev, sysfs_device_attr_octave); fail_attr_octave: device_remove_file(&pm->hdev->dev, sysfs_device_attr_sustain); fail_attr_sustain: device_remove_file(&pm->hdev->dev, sysfs_device_attr_channel); fail: if (pm->card) { snd_card_free(pm->card); pm->card = NULL; } return err; } static int pcmidi_snd_terminate(struct pcmidi_snd *pm) { if (pm->card) { stop_sustain_timers(pm); device_remove_file(&pm->hdev->dev, sysfs_device_attr_channel); device_remove_file(&pm->hdev->dev, sysfs_device_attr_sustain); device_remove_file(&pm->hdev->dev, sysfs_device_attr_octave); snd_card_disconnect(pm->card); snd_card_free_when_closed(pm->card); } return 0; } /* * PC-MIDI report descriptor for report id is wrong. 
*/ static const __u8 *pk_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { if (*rsize == 178 && rdesc[111] == 0x06 && rdesc[112] == 0x00 && rdesc[113] == 0xff) { hid_info(hdev, "fixing up pc-midi keyboard report descriptor\n"); rdesc[144] = 0x18; /* report 4: was 0x10 report count */ } return rdesc; } static int pk_input_mapping(struct hid_device *hdev, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { struct pcmidi_snd *pm = hid_get_drvdata(hdev); if (HID_UP_MSVENDOR == (usage->hid & HID_USAGE_PAGE) && 1 == pm->ifnum) { pcmidi_setup_extra_keys(pm, hi->input); return 0; } return 0; } static int pk_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct pcmidi_snd *pm = hid_get_drvdata(hdev); int ret = 0; if (1 == pm->ifnum) { if (report->id == data[0]) switch (report->id) { case 0x01: /* midi keys (qwerty)*/ case 0x03: /* midi keyboard (musical)*/ case 0x04: /* extra/midi keys (qwerty)*/ ret = pcmidi_handle_report(pm, report->id, data, size); break; } } return ret; } static int pk_probe(struct hid_device *hdev, const struct hid_device_id *id) { int ret; struct usb_interface *intf; unsigned short ifnum; unsigned long quirks = id->driver_data; struct pcmidi_snd *pm; if (!hid_is_usb(hdev)) return -EINVAL; intf = to_usb_interface(hdev->dev.parent); ifnum = intf->cur_altsetting->desc.bInterfaceNumber; pm = kzalloc(sizeof(*pm), GFP_KERNEL); if (pm == NULL) { hid_err(hdev, "can't alloc descriptor\n"); return -ENOMEM; } pm->hdev = hdev; pm->ifnum = ifnum; hid_set_drvdata(hdev, pm); ret = hid_parse(hdev); if (ret) { hid_err(hdev, "hid parse failed\n"); goto err_free; } if (quirks & PK_QUIRK_NOGET) { /* hid_parse cleared all the quirks */ hdev->quirks |= HID_QUIRK_NOGET; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { hid_err(hdev, "hw start failed\n"); goto err_free; } ret = pcmidi_snd_initialise(pm); if (ret < 0) goto err_stop; return 0; err_stop: hid_hw_stop(hdev); err_free: kfree(pm); return ret; } static void pk_remove(struct hid_device *hdev) { struct pcmidi_snd *pm = hid_get_drvdata(hdev); pcmidi_snd_terminate(pm); hid_hw_stop(hdev); kfree(pm); } static const struct hid_device_id pk_devices[] = { {HID_USB_DEVICE(USB_VENDOR_ID_CREATIVELABS, USB_DEVICE_ID_PRODIKEYS_PCMIDI), .driver_data = PK_QUIRK_NOGET}, { } }; MODULE_DEVICE_TABLE(hid, pk_devices); static struct hid_driver pk_driver = { .name = "prodikeys", .id_table = pk_devices, .report_fixup = pk_report_fixup, .input_mapping = pk_input_mapping, .raw_event = pk_raw_event, .probe = pk_probe, .remove = pk_remove, }; module_hid_driver(pk_driver); MODULE_DESCRIPTION("HID driver for the Prodikeys PC-MIDI Keyboard"); MODULE_LICENSE("GPL"); |
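The note decoding in pcmidi_handle_report3() above packs a MIDI status byte and note number out of each raw report byte pair; the following condensed sketch (hypothetical helper name, constants taken from the driver) restates that mapping:

/* Illustrative only: raw bytes below 0x81 are key presses; the matching
 * release arrives as the press code plus 0x40.
 */
static void pcmidi_decode_note_sketch(unsigned char raw, int channel,
				      int octave, unsigned char *status,
				      unsigned char *note)
{
	if (raw < 0x81) {		/* note on: 0x90 | channel */
		*status = 0x90 + channel;
		*note = raw - 0x54 + 60 + octave * 12;
	} else {			/* note off: 0x80 | channel */
		*status = 0x80 + channel;
		*note = raw - 0x94 + 60 + octave * 12;
	}
}

Example: raw byte 0x54 on channel 0 with octave 0 yields status 0x90 and note 60 (middle C); the corresponding release byte 0x94 yields status 0x80 and the same note.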
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Manage send buffer
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#ifndef SMC_TX_H
#define SMC_TX_H

#include <linux/socket.h>
#include <linux/types.h>

#include "smc.h"
#include "smc_cdc.h"

static inline int smc_tx_prepared_sends(struct smc_connection *conn)
{
	union smc_host_cursor sent, prep;

	smc_curs_copy(&sent, &conn->tx_curs_sent, conn);
	smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
	return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
}

void smc_tx_pending(struct smc_connection *conn);
void smc_tx_work(struct work_struct *work);
void smc_tx_init(struct smc_sock *smc);
int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
void smc_tx_consumer_update(struct smc_connection *conn, bool force);
int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
		      u32 offset, int signal);

#endif /* SMC_TX_H */
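smc_tx_prepared_sends() in the header above reports how many bytes sit between the connection's "sent" and "prepared" send-buffer cursors; smc_curs_diff() hides the ring-buffer wrap-around. As a rough sketch of that arithmetic, using plain byte offsets instead of the real union smc_host_cursor (helper name made up here):

/* Bytes from "sent" up to "prep" in a ring of ring_len bytes, assuming
 * prep has wrapped at most once past sent.  Sketch only, not the SMC code.
 */
static inline unsigned int ring_bytes_prepared(unsigned int ring_len,
					       unsigned int sent,
					       unsigned int prep)
{
	return prep >= sent ? prep - sent : ring_len - sent + prep;
}

A result of zero means everything prepared so far has already been pushed to the peer; a non-zero result is roughly the backlog that smc_tx_sndbuf_nonempty() still has to transmit.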
// SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_generic.c Generic packet scheduler routines. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Jamal Hadi Salim, <hadi@cyberus.ca> 990601 * - Ingress support */ #include <linux/bitops.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/if_vlan.h> #include <linux/skb_array.h> #include <linux/if_macvlan.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/dst.h> #include <net/hotdata.h> #include <trace/events/qdisc.h> #include <trace/events/net.h> #include <net/xfrm.h> /* Qdisc to use by default */ const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops; EXPORT_SYMBOL(default_qdisc_ops); static void qdisc_maybe_clear_missed(struct Qdisc *q, const struct netdev_queue *txq) { clear_bit(__QDISC_STATE_MISSED, &q->state); /* Make sure the below netif_xmit_frozen_or_stopped() * checking happens after clearing STATE_MISSED. */ smp_mb__after_atomic(); /* Checking netif_xmit_frozen_or_stopped() again to * make sure STATE_MISSED is set if the STATE_MISSED * set by netif_tx_wake_queue()'s rescheduling of * net_tx_action() is cleared by the above clear_bit(). */ if (!netif_xmit_frozen_or_stopped(txq)) set_bit(__QDISC_STATE_MISSED, &q->state); else set_bit(__QDISC_STATE_DRAINING, &q->state); } /* Main transmission queue. */ /* Modifications to data participating in scheduling must be protected with * qdisc_lock(qdisc) spinlock. * * The idea is the following: * - enqueue, dequeue are serialized via qdisc root lock * - ingress filtering is also serialized via qdisc root lock * - updates to tree and tree walking are only done under the rtnl mutex.
*/ #define SKB_XOFF_MAGIC ((struct sk_buff *)1UL) static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q) { const struct netdev_queue *txq = q->dev_queue; spinlock_t *lock = NULL; struct sk_buff *skb; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } skb = skb_peek(&q->skb_bad_txq); if (skb) { /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { skb = __skb_dequeue(&q->skb_bad_txq); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; } } else { skb = SKB_XOFF_MAGIC; qdisc_maybe_clear_missed(q, txq); } } if (lock) spin_unlock(lock); return skb; } static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q) { struct sk_buff *skb = skb_peek(&q->skb_bad_txq); if (unlikely(skb)) skb = __skb_dequeue_bad_txq(q); return skb; } static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q, struct sk_buff *skb) { spinlock_t *lock = NULL; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } __skb_queue_tail(&q->skb_bad_txq, skb); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_inc(q, skb); qdisc_qstats_cpu_qlen_inc(q); } else { qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; } if (lock) spin_unlock(lock); } static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { spinlock_t *lock = NULL; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } while (skb) { struct sk_buff *next = skb->next; __skb_queue_tail(&q->gso_skb, skb); /* it's still part of the queue */ if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_requeues_inc(q); qdisc_qstats_cpu_backlog_inc(q, skb); qdisc_qstats_cpu_qlen_inc(q); } else { q->qstats.requeues++; qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; } skb = next; } if (lock) { spin_unlock(lock); set_bit(__QDISC_STATE_MISSED, &q->state); } else { __netif_schedule(q); } } static void try_bulk_dequeue_skb(struct Qdisc *q, struct sk_buff *skb, const struct netdev_queue *txq, int *packets) { int bytelimit = qdisc_avail_bulklimit(txq) - skb->len; while (bytelimit > 0) { struct sk_buff *nskb = q->dequeue(q); if (!nskb) break; bytelimit -= nskb->len; /* covers GSO len */ skb->next = nskb; skb = nskb; (*packets)++; /* GSO counts as one pkt */ } skb_mark_not_on_list(skb); } /* This variant of try_bulk_dequeue_skb() makes sure * all skbs in the chain are for the same txq */ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, struct sk_buff *skb, int *packets) { int mapping = skb_get_queue_mapping(skb); struct sk_buff *nskb; int cnt = 0; do { nskb = q->dequeue(q); if (!nskb) break; if (unlikely(skb_get_queue_mapping(nskb) != mapping)) { qdisc_enqueue_skb_bad_txq(q, nskb); break; } skb->next = nskb; skb = nskb; } while (++cnt < 8); (*packets) += cnt; skb_mark_not_on_list(skb); } /* Note that dequeue_skb can possibly return a SKB list (via skb->next). * A requeued skb (via q->gso_skb) can also be a SKB list. */ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, int *packets) { const struct netdev_queue *txq = q->dev_queue; struct sk_buff *skb = NULL; *packets = 1; if (unlikely(!skb_queue_empty(&q->gso_skb))) { spinlock_t *lock = NULL; if (q->flags & TCQ_F_NOLOCK) { lock = qdisc_lock(q); spin_lock(lock); } skb = skb_peek(&q->gso_skb); /* skb may be null if another cpu pulls gso_skb off in between * empty check and lock. 
*/ if (!skb) { if (lock) spin_unlock(lock); goto validate; } /* skb in gso_skb were already validated */ *validate = false; if (xfrm_offload(skb)) *validate = true; /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { skb = __skb_dequeue(&q->gso_skb); if (qdisc_is_percpu_stats(q)) { qdisc_qstats_cpu_backlog_dec(q, skb); qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; } } else { skb = NULL; qdisc_maybe_clear_missed(q, txq); } if (lock) spin_unlock(lock); goto trace; } validate: *validate = true; if ((q->flags & TCQ_F_ONETXQUEUE) && netif_xmit_frozen_or_stopped(txq)) { qdisc_maybe_clear_missed(q, txq); return skb; } skb = qdisc_dequeue_skb_bad_txq(q); if (unlikely(skb)) { if (skb == SKB_XOFF_MAGIC) return NULL; goto bulk; } skb = q->dequeue(q); if (skb) { bulk: if (qdisc_may_bulk(q)) try_bulk_dequeue_skb(q, skb, txq, packets); else try_bulk_dequeue_skb_slow(q, skb, packets); } trace: trace_qdisc_dequeue(q, txq, *packets, skb); return skb; } /* * Transmit possibly several skbs, and handle the return status as * required. Owning qdisc running bit guarantees that only one CPU * can execute this function. * * Returns to the caller: * false - hardware queue frozen backoff * true - feel free to send more pkts */ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate) { int ret = NETDEV_TX_BUSY; bool again = false; /* And release qdisc */ if (root_lock) spin_unlock(root_lock); /* Note that we validate skb (GSO, checksum, ...) outside of locks */ if (validate) skb = validate_xmit_skb_list(skb, dev, &again); #ifdef CONFIG_XFRM_OFFLOAD if (unlikely(again)) { if (root_lock) spin_lock(root_lock); dev_requeue_skb(skb, q); return false; } #endif if (likely(skb)) { HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) skb = dev_hard_start_xmit(skb, dev, txq, &ret); else qdisc_maybe_clear_missed(q, txq); HARD_TX_UNLOCK(dev, txq); } else { if (root_lock) spin_lock(root_lock); return true; } if (root_lock) spin_lock(root_lock); if (!dev_xmit_complete(ret)) { /* Driver returned NETDEV_TX_BUSY - requeue skb */ if (unlikely(ret != NETDEV_TX_BUSY)) net_warn_ratelimited("BUG %s code %d qlen %d\n", dev->name, ret, q->q.qlen); dev_requeue_skb(skb, q); return false; } return true; } /* * NOTE: Called under qdisc_lock(q) with locally disabled BH. * * running seqcount guarantees only one CPU can process * this qdisc at a time. qdisc_lock(q) serializes queue accesses for * this queue. * * netif_tx_lock serializes accesses to device driver. * * qdisc_lock(q) and netif_tx_lock are mutually exclusive, * if one is grabbed, another must be free. * * Note, that this procedure can be called by a watchdog timer * * Returns to the caller: * 0 - queue is empty or throttled. * >0 - queue is not empty. 
* */ static inline bool qdisc_restart(struct Qdisc *q, int *packets) { spinlock_t *root_lock = NULL; struct netdev_queue *txq; struct net_device *dev; struct sk_buff *skb; bool validate; /* Dequeue packet */ skb = dequeue_skb(q, &validate, packets); if (unlikely(!skb)) return false; if (!(q->flags & TCQ_F_NOLOCK)) root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); return sch_direct_xmit(skb, q, dev, txq, root_lock, validate); } void __qdisc_run(struct Qdisc *q) { int quota = READ_ONCE(net_hotdata.dev_tx_weight); int packets; while (qdisc_restart(q, &packets)) { quota -= packets; if (quota <= 0) { if (q->flags & TCQ_F_NOLOCK) set_bit(__QDISC_STATE_MISSED, &q->state); else __netif_schedule(q); break; } } } unsigned long dev_trans_start(struct net_device *dev) { unsigned long res = READ_ONCE(netdev_get_tx_queue(dev, 0)->trans_start); unsigned long val; unsigned int i; for (i = 1; i < dev->num_tx_queues; i++) { val = READ_ONCE(netdev_get_tx_queue(dev, i)->trans_start); if (val && time_after(val, res)) res = val; } return res; } EXPORT_SYMBOL(dev_trans_start); static void netif_freeze_queues(struct net_device *dev) { unsigned int i; int cpu; cpu = smp_processor_id(); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); /* We are the only thread of execution doing a * freeze, but we have to grab the _xmit_lock in * order to synchronize with threads which are in * the ->hard_start_xmit() handler and already * checked the frozen bit. */ __netif_tx_lock(txq, cpu); set_bit(__QUEUE_STATE_FROZEN, &txq->state); __netif_tx_unlock(txq); } } void netif_tx_lock(struct net_device *dev) { spin_lock(&dev->tx_global_lock); netif_freeze_queues(dev); } EXPORT_SYMBOL(netif_tx_lock); static void netif_unfreeze_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); /* No need to grab the _xmit_lock here. If the * queue is not stopped for another reason, we * force a schedule. */ clear_bit(__QUEUE_STATE_FROZEN, &txq->state); netif_schedule_queue(txq); } } void netif_tx_unlock(struct net_device *dev) { netif_unfreeze_queues(dev); spin_unlock(&dev->tx_global_lock); } EXPORT_SYMBOL(netif_tx_unlock); static void dev_watchdog(struct timer_list *t) { struct net_device *dev = from_timer(dev, t, watchdog_timer); bool release = true; spin_lock(&dev->tx_global_lock); if (!qdisc_tx_is_noop(dev)) { if (netif_device_present(dev) && netif_running(dev) && netif_carrier_ok(dev)) { unsigned int timedout_ms = 0; unsigned int i; unsigned long trans_start; unsigned long oldest_start = jiffies; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq; txq = netdev_get_tx_queue(dev, i); if (!netif_xmit_stopped(txq)) continue; /* Paired with WRITE_ONCE() + smp_mb...() in * netdev_tx_sent_queue() and netif_tx_stop_queue(). 
*/ smp_mb(); trans_start = READ_ONCE(txq->trans_start); if (time_after(jiffies, trans_start + dev->watchdog_timeo)) { timedout_ms = jiffies_to_msecs(jiffies - trans_start); atomic_long_inc(&txq->trans_timeout); break; } if (time_after(oldest_start, trans_start)) oldest_start = trans_start; } if (unlikely(timedout_ms)) { trace_net_dev_xmit_timeout(dev, i); netdev_crit(dev, "NETDEV WATCHDOG: CPU: %d: transmit queue %u timed out %u ms\n", raw_smp_processor_id(), i, timedout_ms); netif_freeze_queues(dev); dev->netdev_ops->ndo_tx_timeout(dev, i); netif_unfreeze_queues(dev); } if (!mod_timer(&dev->watchdog_timer, round_jiffies(oldest_start + dev->watchdog_timeo))) release = false; } } spin_unlock(&dev->tx_global_lock); if (release) netdev_put(dev, &dev->watchdog_dev_tracker); } void netdev_watchdog_up(struct net_device *dev) { if (!dev->netdev_ops->ndo_tx_timeout) return; if (dev->watchdog_timeo <= 0) dev->watchdog_timeo = 5*HZ; if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo))) netdev_hold(dev, &dev->watchdog_dev_tracker, GFP_ATOMIC); } EXPORT_SYMBOL_GPL(netdev_watchdog_up); static void netdev_watchdog_down(struct net_device *dev) { netif_tx_lock_bh(dev); if (del_timer(&dev->watchdog_timer)) netdev_put(dev, &dev->watchdog_dev_tracker); netif_tx_unlock_bh(dev); } /** * netif_carrier_on - set carrier * @dev: network device * * Device has detected acquisition of carrier. */ void netif_carrier_on(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; atomic_inc(&dev->carrier_up_count); linkwatch_fire_event(dev); if (netif_running(dev)) netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_carrier_on); /** * netif_carrier_off - clear carrier * @dev: network device * * Device has detected loss of carrier. */ void netif_carrier_off(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) { if (dev->reg_state == NETREG_UNINITIALIZED) return; atomic_inc(&dev->carrier_down_count); linkwatch_fire_event(dev); } } EXPORT_SYMBOL(netif_carrier_off); /** * netif_carrier_event - report carrier state event * @dev: network device * * Device has detected a carrier event but the carrier state wasn't changed. * Use in drivers when querying carrier state asynchronously, to avoid missing * events (link flaps) if link recovers before it's queried. */ void netif_carrier_event(struct net_device *dev) { if (dev->reg_state == NETREG_UNINITIALIZED) return; atomic_inc(&dev->carrier_up_count); atomic_inc(&dev->carrier_down_count); linkwatch_fire_event(dev); } EXPORT_SYMBOL_GPL(netif_carrier_event); /* "NOOP" scheduler: the best scheduler, recommended for all interfaces under all circumstances. It is difficult to invent anything faster or cheaper. 
*/ static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { dev_core_stats_tx_dropped_inc(skb->dev); __qdisc_drop(skb, to_free); return NET_XMIT_CN; } static struct sk_buff *noop_dequeue(struct Qdisc *qdisc) { return NULL; } struct Qdisc_ops noop_qdisc_ops __read_mostly = { .id = "noop", .priv_size = 0, .enqueue = noop_enqueue, .dequeue = noop_dequeue, .peek = noop_dequeue, .owner = THIS_MODULE, }; static struct netdev_queue noop_netdev_queue = { RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc), RCU_POINTER_INITIALIZER(qdisc_sleeping, &noop_qdisc), }; struct Qdisc noop_qdisc = { .enqueue = noop_enqueue, .dequeue = noop_dequeue, .flags = TCQ_F_BUILTIN, .ops = &noop_qdisc_ops, .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), .gso_skb = { .next = (struct sk_buff *)&noop_qdisc.gso_skb, .prev = (struct sk_buff *)&noop_qdisc.gso_skb, .qlen = 0, .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock), }, .skb_bad_txq = { .next = (struct sk_buff *)&noop_qdisc.skb_bad_txq, .prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq, .qlen = 0, .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock), }, .owner = -1, }; EXPORT_SYMBOL(noop_qdisc); static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt, struct netlink_ext_ack *extack) { /* register_qdisc() assigns a default of noop_enqueue if unset, * but __dev_queue_xmit() treats noqueue only as such * if this is NULL - so clear it here. */ qdisc->enqueue = NULL; return 0; } struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { .id = "noqueue", .priv_size = 0, .init = noqueue_init, .enqueue = noop_enqueue, .dequeue = noop_dequeue, .peek = noop_dequeue, .owner = THIS_MODULE, }; const u8 sch_default_prio2band[TC_PRIO_MAX + 1] = { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }; EXPORT_SYMBOL(sch_default_prio2band); /* 3-band FIFO queue: old style, but should be a bit faster than generic prio+fifo combination. */ #define PFIFO_FAST_BANDS 3 /* * Private data for a pfifo_fast scheduler containing: * - rings for priority bands */ struct pfifo_fast_priv { struct skb_array q[PFIFO_FAST_BANDS]; }; static inline struct skb_array *band2list(struct pfifo_fast_priv *priv, int band) { return &priv->q[band]; } static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { int band = sch_default_prio2band[skb->priority & TC_PRIO_MAX]; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct skb_array *q = band2list(priv, band); unsigned int pkt_len = qdisc_pkt_len(skb); int err; err = skb_array_produce(q, skb); if (unlikely(err)) { if (qdisc_is_percpu_stats(qdisc)) return qdisc_drop_cpu(skb, qdisc, to_free); else return qdisc_drop(skb, qdisc, to_free); } qdisc_update_stats_at_enqueue(qdisc, pkt_len); return NET_XMIT_SUCCESS; } static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct sk_buff *skb = NULL; bool need_retry = true; int band; retry: for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { struct skb_array *q = band2list(priv, band); if (__skb_array_empty(q)) continue; skb = __skb_array_consume(q); } if (likely(skb)) { qdisc_update_stats_at_dequeue(qdisc, skb); } else if (need_retry && READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY) { /* Delay clearing the STATE_MISSED here to reduce * the overhead of the second spin_trylock() in * qdisc_run_begin() and __netif_schedule() calling * in qdisc_run_end(). 
*/ clear_bit(__QDISC_STATE_MISSED, &qdisc->state); clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); /* Make sure dequeuing happens after clearing * STATE_MISSED. */ smp_mb__after_atomic(); need_retry = false; goto retry; } return skb; } static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); struct sk_buff *skb = NULL; int band; for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { struct skb_array *q = band2list(priv, band); skb = __skb_array_peek(q); } return skb; } static void pfifo_fast_reset(struct Qdisc *qdisc) { int i, band; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (band = 0; band < PFIFO_FAST_BANDS; band++) { struct skb_array *q = band2list(priv, band); struct sk_buff *skb; /* NULL ring is possible if destroy path is due to a failed * skb_array_init() in pfifo_fast_init() case. */ if (!q->ring.queue) continue; while ((skb = __skb_array_consume(q)) != NULL) kfree_skb(skb); } if (qdisc_is_percpu_stats(qdisc)) { for_each_possible_cpu(i) { struct gnet_stats_queue *q; q = per_cpu_ptr(qdisc->cpu_qstats, i); q->backlog = 0; q->qlen = 0; } } } static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) { struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; memcpy(&opt.priomap, sch_default_prio2band, TC_PRIO_MAX + 1); if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: return -1; } static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt, struct netlink_ext_ack *extack) { unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); int prio; /* guard against zero length rings */ if (!qlen) return -EINVAL; for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { struct skb_array *q = band2list(priv, prio); int err; err = skb_array_init(q, qlen, GFP_KERNEL); if (err) return -ENOMEM; } /* Can by-pass the queue discipline */ qdisc->flags |= TCQ_F_CAN_BYPASS; return 0; } static void pfifo_fast_destroy(struct Qdisc *sch) { struct pfifo_fast_priv *priv = qdisc_priv(sch); int prio; for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { struct skb_array *q = band2list(priv, prio); /* NULL ring is possible if destroy path is due to a failed * skb_array_init() in pfifo_fast_init() case. */ if (!q->ring.queue) continue; /* Destroy ring but no need to kfree_skb because a call to * pfifo_fast_reset() has already done that work. 
*/ ptr_ring_cleanup(&q->ring, NULL); } } static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch, unsigned int new_len) { struct pfifo_fast_priv *priv = qdisc_priv(sch); struct skb_array *bands[PFIFO_FAST_BANDS]; int prio; for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { struct skb_array *q = band2list(priv, prio); bands[prio] = q; } return skb_array_resize_multiple_bh(bands, PFIFO_FAST_BANDS, new_len, GFP_KERNEL); } struct Qdisc_ops pfifo_fast_ops __read_mostly = { .id = "pfifo_fast", .priv_size = sizeof(struct pfifo_fast_priv), .enqueue = pfifo_fast_enqueue, .dequeue = pfifo_fast_dequeue, .peek = pfifo_fast_peek, .init = pfifo_fast_init, .destroy = pfifo_fast_destroy, .reset = pfifo_fast_reset, .dump = pfifo_fast_dump, .change_tx_queue_len = pfifo_fast_change_tx_queue_len, .owner = THIS_MODULE, .static_flags = TCQ_F_NOLOCK | TCQ_F_CPUSTATS, }; EXPORT_SYMBOL(pfifo_fast_ops); static struct lock_class_key qdisc_tx_busylock; struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack) { struct Qdisc *sch; unsigned int size = sizeof(*sch) + ops->priv_size; int err = -ENOBUFS; struct net_device *dev; if (!dev_queue) { NL_SET_ERR_MSG(extack, "No device queue given"); err = -EINVAL; goto errout; } dev = dev_queue->dev; sch = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue)); if (!sch) goto errout; __skb_queue_head_init(&sch->gso_skb); __skb_queue_head_init(&sch->skb_bad_txq); gnet_stats_basic_sync_init(&sch->bstats); lockdep_register_key(&sch->root_lock_key); spin_lock_init(&sch->q.lock); lockdep_set_class(&sch->q.lock, &sch->root_lock_key); if (ops->static_flags & TCQ_F_CPUSTATS) { sch->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!sch->cpu_bstats) goto errout1; sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); if (!sch->cpu_qstats) { free_percpu(sch->cpu_bstats); goto errout1; } } spin_lock_init(&sch->busylock); lockdep_set_class(&sch->busylock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); /* seqlock has the same scope of busylock, for NOLOCK qdisc */ spin_lock_init(&sch->seqlock); lockdep_set_class(&sch->seqlock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); sch->ops = ops; sch->flags = ops->static_flags; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; sch->owner = -1; netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL); refcount_set(&sch->refcnt, 1); return sch; errout1: lockdep_unregister_key(&sch->root_lock_key); kfree(sch); errout: return ERR_PTR(err); } struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, unsigned int parentid, struct netlink_ext_ack *extack) { struct Qdisc *sch; if (!try_module_get(ops->owner)) { NL_SET_ERR_MSG(extack, "Failed to increase module reference counter"); return NULL; } sch = qdisc_alloc(dev_queue, ops, extack); if (IS_ERR(sch)) { module_put(ops->owner); return NULL; } sch->parent = parentid; if (!ops->init || ops->init(sch, NULL, extack) == 0) { trace_qdisc_create(ops, dev_queue->dev, parentid); return sch; } qdisc_put(sch); return NULL; } EXPORT_SYMBOL(qdisc_create_dflt); /* Under qdisc_lock(qdisc) and BH! 
*/ void qdisc_reset(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; trace_qdisc_reset(qdisc); if (ops->reset) ops->reset(qdisc); __skb_queue_purge(&qdisc->gso_skb); __skb_queue_purge(&qdisc->skb_bad_txq); qdisc->q.qlen = 0; qdisc->qstats.backlog = 0; } EXPORT_SYMBOL(qdisc_reset); void qdisc_free(struct Qdisc *qdisc) { if (qdisc_is_percpu_stats(qdisc)) { free_percpu(qdisc->cpu_bstats); free_percpu(qdisc->cpu_qstats); } kfree(qdisc); } static void qdisc_free_cb(struct rcu_head *head) { struct Qdisc *q = container_of(head, struct Qdisc, rcu); qdisc_free(q); } static void __qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; struct net_device *dev = qdisc_dev(qdisc); #ifdef CONFIG_NET_SCHED qdisc_hash_del(qdisc); qdisc_put_stab(rtnl_dereference(qdisc->stab)); #endif gen_kill_estimator(&qdisc->rate_est); qdisc_reset(qdisc); if (ops->destroy) ops->destroy(qdisc); lockdep_unregister_key(&qdisc->root_lock_key); module_put(ops->owner); netdev_put(dev, &qdisc->dev_tracker); trace_qdisc_destroy(qdisc); call_rcu(&qdisc->rcu, qdisc_free_cb); } void qdisc_destroy(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return; __qdisc_destroy(qdisc); } void qdisc_put(struct Qdisc *qdisc) { if (!qdisc) return; if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_test(&qdisc->refcnt)) return; __qdisc_destroy(qdisc); } EXPORT_SYMBOL(qdisc_put); /* Version of qdisc_put() that is called with rtnl mutex unlocked. * Intended to be used as optimization, this function only takes rtnl lock if * qdisc reference counter reached zero. */ void qdisc_put_unlocked(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_rtnl_lock(&qdisc->refcnt)) return; __qdisc_destroy(qdisc); rtnl_unlock(); } EXPORT_SYMBOL(qdisc_put_unlocked); /* Attach toplevel qdisc to device queue. */ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, struct Qdisc *qdisc) { struct Qdisc *oqdisc = rtnl_dereference(dev_queue->qdisc_sleeping); spinlock_t *root_lock; root_lock = qdisc_lock(oqdisc); spin_lock_bh(root_lock); /* ... 
and graft new one */ if (qdisc == NULL) qdisc = &noop_qdisc; rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc); rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc); spin_unlock_bh(root_lock); return oqdisc; } EXPORT_SYMBOL(dev_graft_qdisc); static void shutdown_scheduler_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_qdisc_default) { struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); struct Qdisc *qdisc_default = _qdisc_default; if (qdisc) { rcu_assign_pointer(dev_queue->qdisc, qdisc_default); rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc_default); qdisc_put(qdisc); } } static void attach_one_default_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_unused) { struct Qdisc *qdisc; const struct Qdisc_ops *ops = default_qdisc_ops; if (dev->priv_flags & IFF_NO_QUEUE) ops = &noqueue_qdisc_ops; else if(dev->type == ARPHRD_CAN) ops = &pfifo_fast_ops; qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL); if (!qdisc) return; if (!netif_is_multiqueue(dev)) qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc); } static void attach_default_qdiscs(struct net_device *dev) { struct netdev_queue *txq; struct Qdisc *qdisc; txq = netdev_get_tx_queue(dev, 0); if (!netif_is_multiqueue(dev) || dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); qdisc = rtnl_dereference(txq->qdisc_sleeping); rcu_assign_pointer(dev->qdisc, qdisc); qdisc_refcount_inc(qdisc); } else { qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL); if (qdisc) { rcu_assign_pointer(dev->qdisc, qdisc); qdisc->ops->attach(qdisc); } } qdisc = rtnl_dereference(dev->qdisc); /* Detect default qdisc setup/init failed and fallback to "noqueue" */ if (qdisc == &noop_qdisc) { netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n", default_qdisc_ops->id, noqueue_qdisc_ops.id); netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); dev->priv_flags |= IFF_NO_QUEUE; netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); qdisc = rtnl_dereference(txq->qdisc_sleeping); rcu_assign_pointer(dev->qdisc, qdisc); qdisc_refcount_inc(qdisc); dev->priv_flags ^= IFF_NO_QUEUE; } #ifdef CONFIG_NET_SCHED if (qdisc != &noop_qdisc) qdisc_hash_add(qdisc, false); #endif } static void transition_one_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_need_watchdog) { struct Qdisc *new_qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); int *need_watchdog_p = _need_watchdog; if (!(new_qdisc->flags & TCQ_F_BUILTIN)) clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state); rcu_assign_pointer(dev_queue->qdisc, new_qdisc); if (need_watchdog_p) { WRITE_ONCE(dev_queue->trans_start, 0); *need_watchdog_p = 1; } } void dev_activate(struct net_device *dev) { int need_watchdog; /* No queueing discipline is attached to device; * create default one for devices, which need queueing * and noqueue_qdisc for virtual interfaces */ if (rtnl_dereference(dev->qdisc) == &noop_qdisc) attach_default_qdiscs(dev); if (!netif_carrier_ok(dev)) /* Delay activation until next carrier-on event */ return; need_watchdog = 0; netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog); if (dev_ingress_queue(dev)) transition_one_qdisc(dev, dev_ingress_queue(dev), NULL); if (need_watchdog) { netif_trans_update(dev); netdev_watchdog_up(dev); } } EXPORT_SYMBOL(dev_activate); static void qdisc_deactivate(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_BUILTIN) return; 
set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); } static void dev_deactivate_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_sync_needed) { bool *sync_needed = _sync_needed; struct Qdisc *qdisc; qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { if (qdisc->enqueue) *sync_needed = true; qdisc_deactivate(qdisc); rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc); } } static void dev_reset_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_unused) { struct Qdisc *qdisc; bool nolock; qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); if (!qdisc) return; nolock = qdisc->flags & TCQ_F_NOLOCK; if (nolock) spin_lock_bh(&qdisc->seqlock); spin_lock_bh(qdisc_lock(qdisc)); qdisc_reset(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); if (nolock) { clear_bit(__QDISC_STATE_MISSED, &qdisc->state); clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); spin_unlock_bh(&qdisc->seqlock); } } static bool some_qdisc_is_busy(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *dev_queue; spinlock_t *root_lock; struct Qdisc *q; int val; dev_queue = netdev_get_tx_queue(dev, i); q = rtnl_dereference(dev_queue->qdisc_sleeping); root_lock = qdisc_lock(q); spin_lock_bh(root_lock); val = (qdisc_is_running(q) || test_bit(__QDISC_STATE_SCHED, &q->state)); spin_unlock_bh(root_lock); if (val) return true; } return false; } /** * dev_deactivate_many - deactivate transmissions on several devices * @head: list of devices to deactivate * * This function returns only when all outstanding transmissions * have completed, unless all devices are in dismantle phase. */ void dev_deactivate_many(struct list_head *head) { bool sync_needed = false; struct net_device *dev; list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, &sync_needed); if (dev_ingress_queue(dev)) dev_deactivate_queue(dev, dev_ingress_queue(dev), &sync_needed); netdev_watchdog_down(dev); } /* Wait for outstanding qdisc enqueuing calls. */ if (sync_needed) synchronize_net(); list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_reset_queue, NULL); if (dev_ingress_queue(dev)) dev_reset_queue(dev, dev_ingress_queue(dev), NULL); } /* Wait for outstanding qdisc_run calls. */ list_for_each_entry(dev, head, close_list) { while (some_qdisc_is_busy(dev)) { /* wait_event() would avoid this sleep-loop but would * require expensive checks in the fast paths of packet * processing which isn't worth it. 
*/ schedule_timeout_uninterruptible(1); } } } void dev_deactivate(struct net_device *dev) { LIST_HEAD(single); list_add(&dev->close_list, &single); dev_deactivate_many(&single); list_del(&single); } EXPORT_SYMBOL(dev_deactivate); static int qdisc_change_tx_queue_len(struct net_device *dev, struct netdev_queue *dev_queue) { struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); const struct Qdisc_ops *ops = qdisc->ops; if (ops->change_tx_queue_len) return ops->change_tx_queue_len(qdisc, dev->tx_queue_len); return 0; } void dev_qdisc_change_real_num_tx(struct net_device *dev, unsigned int new_real_tx) { struct Qdisc *qdisc = rtnl_dereference(dev->qdisc); if (qdisc->ops->change_real_num_tx) qdisc->ops->change_real_num_tx(qdisc, new_real_tx); } void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx) { #ifdef CONFIG_NET_SCHED struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; unsigned int i; for (i = new_real_tx; i < dev->real_num_tx_queues; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping); /* Only update the default qdiscs we created, * qdiscs with handles are always hashed. */ if (qdisc != &noop_qdisc && !qdisc->handle) qdisc_hash_del(qdisc); } for (i = dev->real_num_tx_queues; i < new_real_tx; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping); if (qdisc != &noop_qdisc && !qdisc->handle) qdisc_hash_add(qdisc, false); } #endif } EXPORT_SYMBOL(mq_change_real_num_tx); int dev_qdisc_change_tx_queue_len(struct net_device *dev) { bool up = dev->flags & IFF_UP; unsigned int i; int ret = 0; if (up) dev_deactivate(dev); for (i = 0; i < dev->num_tx_queues; i++) { ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]); /* TODO: revert changes on a partial failure */ if (ret) break; } if (up) dev_activate(dev); return ret; } static void dev_init_scheduler_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_qdisc) { struct Qdisc *qdisc = _qdisc; rcu_assign_pointer(dev_queue->qdisc, qdisc); rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc); } void dev_init_scheduler(struct net_device *dev) { rcu_assign_pointer(dev->qdisc, &noop_qdisc); netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); timer_setup(&dev->watchdog_timer, dev_watchdog, 0); } void dev_shutdown(struct net_device *dev) { netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); qdisc_put(rtnl_dereference(dev->qdisc)); rcu_assign_pointer(dev->qdisc, &noop_qdisc); WARN_ON(timer_pending(&dev->watchdog_timer)); } /** * psched_ratecfg_precompute__() - Pre-compute values for reciprocal division * @rate: Rate to compute reciprocal division values of * @mult: Multiplier for reciprocal division * @shift: Shift for reciprocal division * * The multiplier and shift for reciprocal division by rate are stored * in mult and shift. * * The deal here is to replace a divide by a reciprocal one * in fast path (a reciprocal divide is a multiply and a shift) * * Normal formula would be : * time_in_ns = (NSEC_PER_SEC * len) / rate_bps * * We compute mult/shift to use instead : * time_in_ns = (len * mult) >> shift; * * We try to get the highest possible mult value for accuracy, * but have to make sure no overflows will ever happen. * * reciprocal_value() is not used here it doesn't handle 64-bit values. 
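 *
 * For example (illustrative numbers only): with rate = 125000000 bytes/s
 * (1 Gbit/s), the precomputation below ends up with a mult/shift pair such
 * that a 1500 byte packet costs
 *
 *	time_in_ns = (1500 * mult) >> shift ~= 12000
 *
 * which matches 1500 * NSEC_PER_SEC / rate while needing only a multiply
 * and a shift on the transmit fast path.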
*/ static void psched_ratecfg_precompute__(u64 rate, u32 *mult, u8 *shift) { u64 factor = NSEC_PER_SEC; *mult = 1; *shift = 0; if (rate <= 0) return; for (;;) { *mult = div64_u64(factor, rate); if (*mult & (1U << 31) || factor & (1ULL << 63)) break; factor <<= 1; (*shift)++; } } void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf, u64 rate64) { memset(r, 0, sizeof(*r)); r->overhead = conf->overhead; r->mpu = conf->mpu; r->rate_bytes_ps = max_t(u64, conf->rate, rate64); r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); psched_ratecfg_precompute__(r->rate_bytes_ps, &r->mult, &r->shift); } EXPORT_SYMBOL(psched_ratecfg_precompute); void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64) { r->rate_pkts_ps = pktrate64; psched_ratecfg_precompute__(r->rate_pkts_ps, &r->mult, &r->shift); } EXPORT_SYMBOL(psched_ppscfg_precompute); void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, struct tcf_proto *tp_head) { /* Protected with chain0->filter_chain_lock. * Can't access chain directly because tp_head can be NULL. */ struct mini_Qdisc *miniq_old = rcu_dereference_protected(*miniqp->p_miniq, 1); struct mini_Qdisc *miniq; if (!tp_head) { RCU_INIT_POINTER(*miniqp->p_miniq, NULL); } else { miniq = miniq_old != &miniqp->miniq1 ? &miniqp->miniq1 : &miniqp->miniq2; /* We need to make sure that readers won't see the miniq * we are about to modify. So ensure that at least one RCU * grace period has elapsed since the miniq was made * inactive. */ if (IS_ENABLED(CONFIG_PREEMPT_RT)) cond_synchronize_rcu(miniq->rcu_state); else if (!poll_state_synchronize_rcu(miniq->rcu_state)) synchronize_rcu_expedited(); miniq->filter_list = tp_head; rcu_assign_pointer(*miniqp->p_miniq, miniq); } if (miniq_old) /* This is counterpart of the rcu sync above. We need to * block potential new user of miniq_old until all readers * are not seeing it. */ miniq_old->rcu_state = start_poll_synchronize_rcu(); } EXPORT_SYMBOL(mini_qdisc_pair_swap); void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp, struct tcf_block *block) { miniqp->miniq1.block = block; miniqp->miniq2.block = block; } EXPORT_SYMBOL(mini_qdisc_pair_block_init); void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, struct mini_Qdisc __rcu **p_miniq) { miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats; miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats; miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats; miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats; miniqp->miniq1.rcu_state = get_state_synchronize_rcu(); miniqp->miniq2.rcu_state = miniqp->miniq1.rcu_state; miniqp->p_miniq = p_miniq; } EXPORT_SYMBOL(mini_qdisc_pair_init); |
// SPDX-License-Identifier: GPL-2.0-only /* * Framework for buffer objects that can be shared across devices/subsystems. * * Copyright(C) 2011 Linaro Limited. All rights reserved. * Author: Sumit Semwal <sumit.semwal@ti.com> * * Many thanks to linaro-mm-sig list, and specially * Arnd Bergmann <arnd@arndb.de>, Rob Clark <rob@ti.com> and * Daniel Vetter <daniel@ffwll.ch> for their support in creation and * refining of this idea. */ #include <linux/fs.h> #include <linux/slab.h> #include <linux/dma-buf.h> #include <linux/dma-fence.h> #include <linux/dma-fence-unwrap.h> #include <linux/anon_inodes.h> #include <linux/export.h> #include <linux/debugfs.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/sync_file.h> #include <linux/poll.h> #include <linux/dma-resv.h> #include <linux/mm.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <uapi/linux/dma-buf.h> #include <uapi/linux/magic.h> #include "dma-buf-sysfs-stats.h" static inline int is_dma_buf_file(struct file *); #if IS_ENABLED(CONFIG_DEBUG_FS) static DEFINE_MUTEX(debugfs_list_mutex); static LIST_HEAD(debugfs_list); static void __dma_buf_debugfs_list_add(struct dma_buf *dmabuf) { mutex_lock(&debugfs_list_mutex); list_add(&dmabuf->list_node, &debugfs_list); mutex_unlock(&debugfs_list_mutex); } static void __dma_buf_debugfs_list_del(struct dma_buf *dmabuf) { if (!dmabuf) return; mutex_lock(&debugfs_list_mutex); list_del(&dmabuf->list_node); mutex_unlock(&debugfs_list_mutex); } #else static void __dma_buf_debugfs_list_add(struct dma_buf *dmabuf) { } static void __dma_buf_debugfs_list_del(struct dma_buf *dmabuf) { } #endif static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen) { struct dma_buf *dmabuf; char name[DMA_BUF_NAME_LEN]; ssize_t ret = 0; dmabuf = dentry->d_fsdata; spin_lock(&dmabuf->name_lock); if (dmabuf->name) ret = strscpy(name, dmabuf->name, sizeof(name)); spin_unlock(&dmabuf->name_lock); return dynamic_dname(buffer, buflen, "/%s:%s", dentry->d_name.name, ret > 0 ?
name : ""); } static void dma_buf_release(struct dentry *dentry) { struct dma_buf *dmabuf; dmabuf = dentry->d_fsdata; if (unlikely(!dmabuf)) return; BUG_ON(dmabuf->vmapping_counter); /* * If you hit this BUG() it could mean: * * There's a file reference imbalance in dma_buf_poll / dma_buf_poll_cb or somewhere else * * dmabuf->cb_in/out.active are non-0 despite no pending fence callback */ BUG_ON(dmabuf->cb_in.active || dmabuf->cb_out.active); dma_buf_stats_teardown(dmabuf); dmabuf->ops->release(dmabuf); if (dmabuf->resv == (struct dma_resv *)&dmabuf[1]) dma_resv_fini(dmabuf->resv); WARN_ON(!list_empty(&dmabuf->attachments)); module_put(dmabuf->owner); kfree(dmabuf->name); kfree(dmabuf); } static int dma_buf_file_release(struct inode *inode, struct file *file) { if (!is_dma_buf_file(file)) return -EINVAL; __dma_buf_debugfs_list_del(file->private_data); return 0; } static const struct dentry_operations dma_buf_dentry_ops = { .d_dname = dmabuffs_dname, .d_release = dma_buf_release, }; static struct vfsmount *dma_buf_mnt; static int dma_buf_fs_init_context(struct fs_context *fc) { struct pseudo_fs_context *ctx; ctx = init_pseudo(fc, DMA_BUF_MAGIC); if (!ctx) return -ENOMEM; ctx->dops = &dma_buf_dentry_ops; return 0; } static struct file_system_type dma_buf_fs_type = { .name = "dmabuf", .init_fs_context = dma_buf_fs_init_context, .kill_sb = kill_anon_super, }; static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma) { struct dma_buf *dmabuf; if (!is_dma_buf_file(file)) return -EINVAL; dmabuf = file->private_data; /* check if buffer supports mmap */ if (!dmabuf->ops->mmap) return -EINVAL; /* check for overflowing the buffer's size */ if (vma->vm_pgoff + vma_pages(vma) > dmabuf->size >> PAGE_SHIFT) return -EINVAL; return dmabuf->ops->mmap(dmabuf, vma); } static loff_t dma_buf_llseek(struct file *file, loff_t offset, int whence) { struct dma_buf *dmabuf; loff_t base; if (!is_dma_buf_file(file)) return -EBADF; dmabuf = file->private_data; /* only support discovering the end of the buffer, * but also allow SEEK_SET to maintain the idiomatic * SEEK_END(0), SEEK_CUR(0) pattern. */ if (whence == SEEK_END) base = dmabuf->size; else if (whence == SEEK_SET) base = 0; else return -EINVAL; if (offset != 0) return -EINVAL; return base + offset; } /** * DOC: implicit fence polling * * To support cross-device and cross-driver synchronization of buffer access * implicit fences (represented internally in the kernel with &struct dma_fence) * can be attached to a &dma_buf. The glue for that and a few related things are * provided in the &dma_resv structure. * * Userspace can query the state of these implicitly tracked fences using poll() * and related system calls: * * - Checking for EPOLLIN, i.e. read access, can be use to query the state of the * most recent write or exclusive fence. * * - Checking for EPOLLOUT, i.e. write access, can be used to query the state of * all attached fences, shared and exclusive ones. * * Note that this only signals the completion of the respective fences, i.e. the * DMA transfers are complete. Cache flushing and any other necessary * preparations before CPU access can begin still need to happen. * * As an alternative to poll(), the set of fences on DMA buffer can be * exported as a &sync_file using &dma_buf_sync_file_export. 
*/ static void dma_buf_poll_cb(struct dma_fence *fence, struct dma_fence_cb *cb) { struct dma_buf_poll_cb_t *dcb = (struct dma_buf_poll_cb_t *)cb; struct dma_buf *dmabuf = container_of(dcb->poll, struct dma_buf, poll); unsigned long flags; spin_lock_irqsave(&dcb->poll->lock, flags); wake_up_locked_poll(dcb->poll, dcb->active); dcb->active = 0; spin_unlock_irqrestore(&dcb->poll->lock, flags); dma_fence_put(fence); /* Paired with get_file in dma_buf_poll */ fput(dmabuf->file); } static bool dma_buf_poll_add_cb(struct dma_resv *resv, bool write, struct dma_buf_poll_cb_t *dcb) { struct dma_resv_iter cursor; struct dma_fence *fence; int r; dma_resv_for_each_fence(&cursor, resv, dma_resv_usage_rw(write), fence) { dma_fence_get(fence); r = dma_fence_add_callback(fence, &dcb->cb, dma_buf_poll_cb); if (!r) return true; dma_fence_put(fence); } return false; } static __poll_t dma_buf_poll(struct file *file, poll_table *poll) { struct dma_buf *dmabuf; struct dma_resv *resv; __poll_t events; dmabuf = file->private_data; if (!dmabuf || !dmabuf->resv) return EPOLLERR; resv = dmabuf->resv; poll_wait(file, &dmabuf->poll, poll); events = poll_requested_events(poll) & (EPOLLIN | EPOLLOUT); if (!events) return 0; dma_resv_lock(resv, NULL); if (events & EPOLLOUT) { struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_out; /* Check that callback isn't busy */ spin_lock_irq(&dmabuf->poll.lock); if (dcb->active) events &= ~EPOLLOUT; else dcb->active = EPOLLOUT; spin_unlock_irq(&dmabuf->poll.lock); if (events & EPOLLOUT) { /* Paired with fput in dma_buf_poll_cb */ get_file(dmabuf->file); if (!dma_buf_poll_add_cb(resv, true, dcb)) /* No callback queued, wake up any other waiters */ dma_buf_poll_cb(NULL, &dcb->cb); else events &= ~EPOLLOUT; } } if (events & EPOLLIN) { struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_in; /* Check that callback isn't busy */ spin_lock_irq(&dmabuf->poll.lock); if (dcb->active) events &= ~EPOLLIN; else dcb->active = EPOLLIN; spin_unlock_irq(&dmabuf->poll.lock); if (events & EPOLLIN) { /* Paired with fput in dma_buf_poll_cb */ get_file(dmabuf->file); if (!dma_buf_poll_add_cb(resv, false, dcb)) /* No callback queued, wake up any other waiters */ dma_buf_poll_cb(NULL, &dcb->cb); else events &= ~EPOLLIN; } } dma_resv_unlock(resv); return events; } /** * dma_buf_set_name - Set a name to a specific dma_buf to track the usage. * It could support changing the name of the dma-buf if the same * piece of memory is used for multiple purpose between different devices. * * @dmabuf: [in] dmabuf buffer that will be renamed. * @buf: [in] A piece of userspace memory that contains the name of * the dma-buf. * * Returns 0 on success. If the dma-buf buffer is already attached to * devices, return -EBUSY. 
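 *
 * An illustrative userspace sketch (assumes DMA_BUF_SET_NAME from
 * <linux/dma-buf.h>; dmabuf_fd is a valid dma-buf file descriptor):
 *
 * .. code-block:: c
 *
 *     ioctl(dmabuf_fd, DMA_BUF_SET_NAME, "camera-preview");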
* */ static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf) { char *name = strndup_user(buf, DMA_BUF_NAME_LEN); if (IS_ERR(name)) return PTR_ERR(name); spin_lock(&dmabuf->name_lock); kfree(dmabuf->name); dmabuf->name = name; spin_unlock(&dmabuf->name_lock); return 0; } #if IS_ENABLED(CONFIG_SYNC_FILE) static long dma_buf_export_sync_file(struct dma_buf *dmabuf, void __user *user_data) { struct dma_buf_export_sync_file arg; enum dma_resv_usage usage; struct dma_fence *fence = NULL; struct sync_file *sync_file; int fd, ret; if (copy_from_user(&arg, user_data, sizeof(arg))) return -EFAULT; if (arg.flags & ~DMA_BUF_SYNC_RW) return -EINVAL; if ((arg.flags & DMA_BUF_SYNC_RW) == 0) return -EINVAL; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) return fd; usage = dma_resv_usage_rw(arg.flags & DMA_BUF_SYNC_WRITE); ret = dma_resv_get_singleton(dmabuf->resv, usage, &fence); if (ret) goto err_put_fd; if (!fence) fence = dma_fence_get_stub(); sync_file = sync_file_create(fence); dma_fence_put(fence); if (!sync_file) { ret = -ENOMEM; goto err_put_fd; } arg.fd = fd; if (copy_to_user(user_data, &arg, sizeof(arg))) { ret = -EFAULT; goto err_put_file; } fd_install(fd, sync_file->file); return 0; err_put_file: fput(sync_file->file); err_put_fd: put_unused_fd(fd); return ret; } static long dma_buf_import_sync_file(struct dma_buf *dmabuf, const void __user *user_data) { struct dma_buf_import_sync_file arg; struct dma_fence *fence, *f; enum dma_resv_usage usage; struct dma_fence_unwrap iter; unsigned int num_fences; int ret = 0; if (copy_from_user(&arg, user_data, sizeof(arg))) return -EFAULT; if (arg.flags & ~DMA_BUF_SYNC_RW) return -EINVAL; if ((arg.flags & DMA_BUF_SYNC_RW) == 0) return -EINVAL; fence = sync_file_get_fence(arg.fd); if (!fence) return -EINVAL; usage = (arg.flags & DMA_BUF_SYNC_WRITE) ? 
DMA_RESV_USAGE_WRITE : DMA_RESV_USAGE_READ; num_fences = 0; dma_fence_unwrap_for_each(f, &iter, fence) ++num_fences; if (num_fences > 0) { dma_resv_lock(dmabuf->resv, NULL); ret = dma_resv_reserve_fences(dmabuf->resv, num_fences); if (!ret) { dma_fence_unwrap_for_each(f, &iter, fence) dma_resv_add_fence(dmabuf->resv, f, usage); } dma_resv_unlock(dmabuf->resv); } dma_fence_put(fence); return ret; } #endif static long dma_buf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct dma_buf *dmabuf; struct dma_buf_sync sync; enum dma_data_direction direction; int ret; dmabuf = file->private_data; switch (cmd) { case DMA_BUF_IOCTL_SYNC: if (copy_from_user(&sync, (void __user *) arg, sizeof(sync))) return -EFAULT; if (sync.flags & ~DMA_BUF_SYNC_VALID_FLAGS_MASK) return -EINVAL; switch (sync.flags & DMA_BUF_SYNC_RW) { case DMA_BUF_SYNC_READ: direction = DMA_FROM_DEVICE; break; case DMA_BUF_SYNC_WRITE: direction = DMA_TO_DEVICE; break; case DMA_BUF_SYNC_RW: direction = DMA_BIDIRECTIONAL; break; default: return -EINVAL; } if (sync.flags & DMA_BUF_SYNC_END) ret = dma_buf_end_cpu_access(dmabuf, direction); else ret = dma_buf_begin_cpu_access(dmabuf, direction); return ret; case DMA_BUF_SET_NAME_A: case DMA_BUF_SET_NAME_B: return dma_buf_set_name(dmabuf, (const char __user *)arg); #if IS_ENABLED(CONFIG_SYNC_FILE) case DMA_BUF_IOCTL_EXPORT_SYNC_FILE: return dma_buf_export_sync_file(dmabuf, (void __user *)arg); case DMA_BUF_IOCTL_IMPORT_SYNC_FILE: return dma_buf_import_sync_file(dmabuf, (const void __user *)arg); #endif default: return -ENOTTY; } } static void dma_buf_show_fdinfo(struct seq_file *m, struct file *file) { struct dma_buf *dmabuf = file->private_data; seq_printf(m, "size:\t%zu\n", dmabuf->size); /* Don't count the temporary reference taken inside procfs seq_show */ seq_printf(m, "count:\t%ld\n", file_count(dmabuf->file) - 1); seq_printf(m, "exp_name:\t%s\n", dmabuf->exp_name); spin_lock(&dmabuf->name_lock); if (dmabuf->name) seq_printf(m, "name:\t%s\n", dmabuf->name); spin_unlock(&dmabuf->name_lock); } static const struct file_operations dma_buf_fops = { .release = dma_buf_file_release, .mmap = dma_buf_mmap_internal, .llseek = dma_buf_llseek, .poll = dma_buf_poll, .unlocked_ioctl = dma_buf_ioctl, .compat_ioctl = compat_ptr_ioctl, .show_fdinfo = dma_buf_show_fdinfo, }; /* * is_dma_buf_file - Check if struct file* is associated with dma_buf */ static inline int is_dma_buf_file(struct file *file) { return file->f_op == &dma_buf_fops; } static struct file *dma_buf_getfile(size_t size, int flags) { static atomic64_t dmabuf_inode = ATOMIC64_INIT(0); struct inode *inode = alloc_anon_inode(dma_buf_mnt->mnt_sb); struct file *file; if (IS_ERR(inode)) return ERR_CAST(inode); inode->i_size = size; inode_set_bytes(inode, size); /* * The ->i_ino acquired from get_next_ino() is not unique thus * not suitable for using it as dentry name by dmabuf stats. * Override ->i_ino with the unique and dmabuffs specific * value. */ inode->i_ino = atomic64_inc_return(&dmabuf_inode); flags &= O_ACCMODE | O_NONBLOCK; file = alloc_file_pseudo(inode, dma_buf_mnt, "dmabuf", flags, &dma_buf_fops); if (IS_ERR(file)) goto err_alloc_file; return file; err_alloc_file: iput(inode); return file; } /** * DOC: dma buf device access * * For device DMA access to a shared DMA buffer the usual sequence of operations * is fairly simple: * * 1. The exporter defines his exporter instance using * DEFINE_DMA_BUF_EXPORT_INFO() and calls dma_buf_export() to wrap a private * buffer object into a &dma_buf. 
It then exports that &dma_buf to userspace * as a file descriptor by calling dma_buf_fd(). * * 2. Userspace passes this file-descriptors to all drivers it wants this buffer * to share with: First the file descriptor is converted to a &dma_buf using * dma_buf_get(). Then the buffer is attached to the device using * dma_buf_attach(). * * Up to this stage the exporter is still free to migrate or reallocate the * backing storage. * * 3. Once the buffer is attached to all devices userspace can initiate DMA * access to the shared buffer. In the kernel this is done by calling * dma_buf_map_attachment() and dma_buf_unmap_attachment(). * * 4. Once a driver is done with a shared buffer it needs to call * dma_buf_detach() (after cleaning up any mappings) and then release the * reference acquired with dma_buf_get() by calling dma_buf_put(). * * For the detailed semantics exporters are expected to implement see * &dma_buf_ops. */ /** * dma_buf_export - Creates a new dma_buf, and associates an anon file * with this buffer, so it can be exported. * Also connect the allocator specific data and ops to the buffer. * Additionally, provide a name string for exporter; useful in debugging. * * @exp_info: [in] holds all the export related information provided * by the exporter. see &struct dma_buf_export_info * for further details. * * Returns, on success, a newly created struct dma_buf object, which wraps the * supplied private data and operations for struct dma_buf_ops. On either * missing ops, or error in allocating struct dma_buf, will return negative * error. * * For most cases the easiest way to create @exp_info is through the * %DEFINE_DMA_BUF_EXPORT_INFO macro. */ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) { struct dma_buf *dmabuf; struct dma_resv *resv = exp_info->resv; struct file *file; size_t alloc_size = sizeof(struct dma_buf); int ret; if (WARN_ON(!exp_info->priv || !exp_info->ops || !exp_info->ops->map_dma_buf || !exp_info->ops->unmap_dma_buf || !exp_info->ops->release)) return ERR_PTR(-EINVAL); if (WARN_ON(exp_info->ops->cache_sgt_mapping && (exp_info->ops->pin || exp_info->ops->unpin))) return ERR_PTR(-EINVAL); if (WARN_ON(!exp_info->ops->pin != !exp_info->ops->unpin)) return ERR_PTR(-EINVAL); if (!try_module_get(exp_info->owner)) return ERR_PTR(-ENOENT); file = dma_buf_getfile(exp_info->size, exp_info->flags); if (IS_ERR(file)) { ret = PTR_ERR(file); goto err_module; } if (!exp_info->resv) alloc_size += sizeof(struct dma_resv); else /* prevent &dma_buf[1] == dma_buf->resv */ alloc_size += 1; dmabuf = kzalloc(alloc_size, GFP_KERNEL); if (!dmabuf) { ret = -ENOMEM; goto err_file; } dmabuf->priv = exp_info->priv; dmabuf->ops = exp_info->ops; dmabuf->size = exp_info->size; dmabuf->exp_name = exp_info->exp_name; dmabuf->owner = exp_info->owner; spin_lock_init(&dmabuf->name_lock); init_waitqueue_head(&dmabuf->poll); dmabuf->cb_in.poll = dmabuf->cb_out.poll = &dmabuf->poll; dmabuf->cb_in.active = dmabuf->cb_out.active = 0; INIT_LIST_HEAD(&dmabuf->attachments); if (!resv) { dmabuf->resv = (struct dma_resv *)&dmabuf[1]; dma_resv_init(dmabuf->resv); } else { dmabuf->resv = resv; } ret = dma_buf_stats_setup(dmabuf, file); if (ret) goto err_dmabuf; file->private_data = dmabuf; file->f_path.dentry->d_fsdata = dmabuf; dmabuf->file = file; __dma_buf_debugfs_list_add(dmabuf); return dmabuf; err_dmabuf: if (!resv) dma_resv_fini(dmabuf->resv); kfree(dmabuf); err_file: fput(file); err_module: module_put(exp_info->owner); return ERR_PTR(ret); } EXPORT_SYMBOL_NS_GPL(dma_buf_export, 
"DMA_BUF"); /** * dma_buf_fd - returns a file descriptor for the given struct dma_buf * @dmabuf: [in] pointer to dma_buf for which fd is required. * @flags: [in] flags to give to fd * * On success, returns an associated 'fd'. Else, returns error. */ int dma_buf_fd(struct dma_buf *dmabuf, int flags) { int fd; if (!dmabuf || !dmabuf->file) return -EINVAL; fd = get_unused_fd_flags(flags); if (fd < 0) return fd; fd_install(fd, dmabuf->file); return fd; } EXPORT_SYMBOL_NS_GPL(dma_buf_fd, "DMA_BUF"); /** * dma_buf_get - returns the struct dma_buf related to an fd * @fd: [in] fd associated with the struct dma_buf to be returned * * On success, returns the struct dma_buf associated with an fd; uses * file's refcounting done by fget to increase refcount. returns ERR_PTR * otherwise. */ struct dma_buf *dma_buf_get(int fd) { struct file *file; file = fget(fd); if (!file) return ERR_PTR(-EBADF); if (!is_dma_buf_file(file)) { fput(file); return ERR_PTR(-EINVAL); } return file->private_data; } EXPORT_SYMBOL_NS_GPL(dma_buf_get, "DMA_BUF"); /** * dma_buf_put - decreases refcount of the buffer * @dmabuf: [in] buffer to reduce refcount of * * Uses file's refcounting done implicitly by fput(). * * If, as a result of this call, the refcount becomes 0, the 'release' file * operation related to this fd is called. It calls &dma_buf_ops.release vfunc * in turn, and frees the memory allocated for dmabuf when exported. */ void dma_buf_put(struct dma_buf *dmabuf) { if (WARN_ON(!dmabuf || !dmabuf->file)) return; fput(dmabuf->file); } EXPORT_SYMBOL_NS_GPL(dma_buf_put, "DMA_BUF"); static void mangle_sg_table(struct sg_table *sg_table) { #ifdef CONFIG_DMABUF_DEBUG int i; struct scatterlist *sg; /* To catch abuse of the underlying struct page by importers mix * up the bits, but take care to preserve the low SG_ bits to * not corrupt the sgt. The mixing is undone in __unmap_dma_buf * before passing the sgt back to the exporter. */ for_each_sgtable_sg(sg_table, sg, i) sg->page_link ^= ~0xffUL; #endif } static struct sg_table *__map_dma_buf(struct dma_buf_attachment *attach, enum dma_data_direction direction) { struct sg_table *sg_table; signed long ret; sg_table = attach->dmabuf->ops->map_dma_buf(attach, direction); if (IS_ERR_OR_NULL(sg_table)) return sg_table; if (!dma_buf_attachment_is_dynamic(attach)) { ret = dma_resv_wait_timeout(attach->dmabuf->resv, DMA_RESV_USAGE_KERNEL, true, MAX_SCHEDULE_TIMEOUT); if (ret < 0) { attach->dmabuf->ops->unmap_dma_buf(attach, sg_table, direction); return ERR_PTR(ret); } } mangle_sg_table(sg_table); return sg_table; } /** * DOC: locking convention * * In order to avoid deadlock situations between dma-buf exports and importers, * all dma-buf API users must follow the common dma-buf locking convention. * * Convention for importers * * 1. Importers must hold the dma-buf reservation lock when calling these * functions: * * - dma_buf_pin() * - dma_buf_unpin() * - dma_buf_map_attachment() * - dma_buf_unmap_attachment() * - dma_buf_vmap() * - dma_buf_vunmap() * * 2. Importers must not hold the dma-buf reservation lock when calling these * functions: * * - dma_buf_attach() * - dma_buf_dynamic_attach() * - dma_buf_detach() * - dma_buf_export() * - dma_buf_fd() * - dma_buf_get() * - dma_buf_put() * - dma_buf_mmap() * - dma_buf_begin_cpu_access() * - dma_buf_end_cpu_access() * - dma_buf_map_attachment_unlocked() * - dma_buf_unmap_attachment_unlocked() * - dma_buf_vmap_unlocked() * - dma_buf_vunmap_unlocked() * * Convention for exporters * * 1. 
These &dma_buf_ops callbacks are invoked with unlocked dma-buf * reservation and exporter can take the lock: * * - &dma_buf_ops.attach() * - &dma_buf_ops.detach() * - &dma_buf_ops.release() * - &dma_buf_ops.begin_cpu_access() * - &dma_buf_ops.end_cpu_access() * - &dma_buf_ops.mmap() * * 2. These &dma_buf_ops callbacks are invoked with locked dma-buf * reservation and exporter can't take the lock: * * - &dma_buf_ops.pin() * - &dma_buf_ops.unpin() * - &dma_buf_ops.map_dma_buf() * - &dma_buf_ops.unmap_dma_buf() * - &dma_buf_ops.vmap() * - &dma_buf_ops.vunmap() * * 3. Exporters must hold the dma-buf reservation lock when calling these * functions: * * - dma_buf_move_notify() */ /** * dma_buf_dynamic_attach - Add the device to dma_buf's attachments list * @dmabuf: [in] buffer to attach device to. * @dev: [in] device to be attached. * @importer_ops: [in] importer operations for the attachment * @importer_priv: [in] importer private pointer for the attachment * * Returns struct dma_buf_attachment pointer for this attachment. Attachments * must be cleaned up by calling dma_buf_detach(). * * Optionally this calls &dma_buf_ops.attach to allow device-specific attach * functionality. * * Returns: * * A pointer to newly created &dma_buf_attachment on success, or a negative * error code wrapped into a pointer on failure. * * Note that this can fail if the backing storage of @dmabuf is in a place not * accessible to @dev, and cannot be moved to a more suitable place. This is * indicated with the error code -EBUSY. */ struct dma_buf_attachment * dma_buf_dynamic_attach(struct dma_buf *dmabuf, struct device *dev, const struct dma_buf_attach_ops *importer_ops, void *importer_priv) { struct dma_buf_attachment *attach; int ret; if (WARN_ON(!dmabuf || !dev)) return ERR_PTR(-EINVAL); if (WARN_ON(importer_ops && !importer_ops->move_notify)) return ERR_PTR(-EINVAL); attach = kzalloc(sizeof(*attach), GFP_KERNEL); if (!attach) return ERR_PTR(-ENOMEM); attach->dev = dev; attach->dmabuf = dmabuf; if (importer_ops) attach->peer2peer = importer_ops->allow_peer2peer; attach->importer_ops = importer_ops; attach->importer_priv = importer_priv; if (dmabuf->ops->attach) { ret = dmabuf->ops->attach(dmabuf, attach); if (ret) goto err_attach; } dma_resv_lock(dmabuf->resv, NULL); list_add(&attach->node, &dmabuf->attachments); dma_resv_unlock(dmabuf->resv); /* When either the importer or the exporter can't handle dynamic * mappings we cache the mapping here to avoid issues with the * reservation object lock. */ if (dma_buf_attachment_is_dynamic(attach) != dma_buf_is_dynamic(dmabuf)) { struct sg_table *sgt; dma_resv_lock(attach->dmabuf->resv, NULL); if (dma_buf_is_dynamic(attach->dmabuf)) { ret = dmabuf->ops->pin(attach); if (ret) goto err_unlock; } sgt = __map_dma_buf(attach, DMA_BIDIRECTIONAL); if (!sgt) sgt = ERR_PTR(-ENOMEM); if (IS_ERR(sgt)) { ret = PTR_ERR(sgt); goto err_unpin; } dma_resv_unlock(attach->dmabuf->resv); attach->sgt = sgt; attach->dir = DMA_BIDIRECTIONAL; } return attach; err_attach: kfree(attach); return ERR_PTR(ret); err_unpin: if (dma_buf_is_dynamic(attach->dmabuf)) dmabuf->ops->unpin(attach); err_unlock: dma_resv_unlock(attach->dmabuf->resv); dma_buf_detach(dmabuf, attach); return ERR_PTR(ret); } EXPORT_SYMBOL_NS_GPL(dma_buf_dynamic_attach, "DMA_BUF"); /** * dma_buf_attach - Wrapper for dma_buf_dynamic_attach * @dmabuf: [in] buffer to attach device to. * @dev: [in] device to be attached. * * Wrapper to call dma_buf_dynamic_attach() for drivers which still use a static * mapping. 
*/ struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf, struct device *dev) { return dma_buf_dynamic_attach(dmabuf, dev, NULL, NULL); } EXPORT_SYMBOL_NS_GPL(dma_buf_attach, "DMA_BUF"); static void __unmap_dma_buf(struct dma_buf_attachment *attach, struct sg_table *sg_table, enum dma_data_direction direction) { /* uses XOR, hence this unmangles */ mangle_sg_table(sg_table); attach->dmabuf->ops->unmap_dma_buf(attach, sg_table, direction); } /** * dma_buf_detach - Remove the given attachment from dmabuf's attachments list * @dmabuf: [in] buffer to detach from. * @attach: [in] attachment to be detached; is free'd after this call. * * Clean up a device attachment obtained by calling dma_buf_attach(). * * Optionally this calls &dma_buf_ops.detach for device-specific detach. */ void dma_buf_detach(struct dma_buf *dmabuf, struct dma_buf_attachment *attach) { if (WARN_ON(!dmabuf || !attach || dmabuf != attach->dmabuf)) return; dma_resv_lock(dmabuf->resv, NULL); if (attach->sgt) { __unmap_dma_buf(attach, attach->sgt, attach->dir); if (dma_buf_is_dynamic(attach->dmabuf)) dmabuf->ops->unpin(attach); } list_del(&attach->node); dma_resv_unlock(dmabuf->resv); if (dmabuf->ops->detach) dmabuf->ops->detach(dmabuf, attach); kfree(attach); } EXPORT_SYMBOL_NS_GPL(dma_buf_detach, "DMA_BUF"); /** * dma_buf_pin - Lock down the DMA-buf * @attach: [in] attachment which should be pinned * * Only dynamic importers (who set up @attach with dma_buf_dynamic_attach()) may * call this, and only for limited use cases like scanout and not for temporary * pin operations. It is not permitted to allow userspace to pin arbitrary * amounts of buffers through this interface. * * Buffers must be unpinned by calling dma_buf_unpin(). * * Returns: * 0 on success, negative error code on failure. */ int dma_buf_pin(struct dma_buf_attachment *attach) { struct dma_buf *dmabuf = attach->dmabuf; int ret = 0; WARN_ON(!dma_buf_attachment_is_dynamic(attach)); dma_resv_assert_held(dmabuf->resv); if (dmabuf->ops->pin) ret = dmabuf->ops->pin(attach); return ret; } EXPORT_SYMBOL_NS_GPL(dma_buf_pin, "DMA_BUF"); /** * dma_buf_unpin - Unpin a DMA-buf * @attach: [in] attachment which should be unpinned * * This unpins a buffer pinned by dma_buf_pin() and allows the exporter to move * any mapping of @attach again and inform the importer through * &dma_buf_attach_ops.move_notify. */ void dma_buf_unpin(struct dma_buf_attachment *attach) { struct dma_buf *dmabuf = attach->dmabuf; WARN_ON(!dma_buf_attachment_is_dynamic(attach)); dma_resv_assert_held(dmabuf->resv); if (dmabuf->ops->unpin) dmabuf->ops->unpin(attach); } EXPORT_SYMBOL_NS_GPL(dma_buf_unpin, "DMA_BUF"); /** * dma_buf_map_attachment - Returns the scatterlist table of the attachment; * mapped into _device_ address space. Is a wrapper for map_dma_buf() of the * dma_buf_ops. * @attach: [in] attachment whose scatterlist is to be returned * @direction: [in] direction of DMA transfer * * Returns sg_table containing the scatterlist to be returned; returns ERR_PTR * on error. May return -EINTR if it is interrupted by a signal. * * On success, the DMA addresses and lengths in the returned scatterlist are * PAGE_SIZE aligned. * * A mapping must be unmapped by using dma_buf_unmap_attachment(). Note that * the underlying backing storage is pinned for as long as a mapping exists, * therefore users/importers should not hold onto a mapping for undue amounts of * time. * * Important: Dynamic importers must wait for the exclusive fence of the struct * dma_resv attached to the DMA-BUF first. 
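 *
 * A minimal sketch (illustrative only; attach was obtained via
 * dma_buf_attach() or dma_buf_dynamic_attach(), error handling trimmed).
 * Per the locking convention the reservation lock must be held:
 *
 * .. code-block:: c
 *
 *     struct sg_table *sgt;
 *
 *     dma_resv_lock(attach->dmabuf->resv, NULL);
 *     sgt = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL);
 *     dma_resv_unlock(attach->dmabuf->resv);
 *     if (IS_ERR(sgt))
 *             return PTR_ERR(sgt);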
*/ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach, enum dma_data_direction direction) { struct sg_table *sg_table; int r; might_sleep(); if (WARN_ON(!attach || !attach->dmabuf)) return ERR_PTR(-EINVAL); dma_resv_assert_held(attach->dmabuf->resv); if (attach->sgt) { /* * Two mappings with different directions for the same * attachment are not allowed. */ if (attach->dir != direction && attach->dir != DMA_BIDIRECTIONAL) return ERR_PTR(-EBUSY); return attach->sgt; } if (dma_buf_is_dynamic(attach->dmabuf)) { if (!IS_ENABLED(CONFIG_DMABUF_MOVE_NOTIFY)) { r = attach->dmabuf->ops->pin(attach); if (r) return ERR_PTR(r); } } sg_table = __map_dma_buf(attach, direction); if (!sg_table) sg_table = ERR_PTR(-ENOMEM); if (IS_ERR(sg_table) && dma_buf_is_dynamic(attach->dmabuf) && !IS_ENABLED(CONFIG_DMABUF_MOVE_NOTIFY)) attach->dmabuf->ops->unpin(attach); if (!IS_ERR(sg_table) && attach->dmabuf->ops->cache_sgt_mapping) { attach->sgt = sg_table; attach->dir = direction; } #ifdef CONFIG_DMA_API_DEBUG if (!IS_ERR(sg_table)) { struct scatterlist *sg; u64 addr; int len; int i; for_each_sgtable_dma_sg(sg_table, sg, i) { addr = sg_dma_address(sg); len = sg_dma_len(sg); if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(len)) { pr_debug("%s: addr %llx or len %x is not page aligned!\n", __func__, addr, len); } } } #endif /* CONFIG_DMA_API_DEBUG */ return sg_table; } EXPORT_SYMBOL_NS_GPL(dma_buf_map_attachment, "DMA_BUF"); /** * dma_buf_map_attachment_unlocked - Returns the scatterlist table of the attachment; * mapped into _device_ address space. Is a wrapper for map_dma_buf() of the * dma_buf_ops. * @attach: [in] attachment whose scatterlist is to be returned * @direction: [in] direction of DMA transfer * * Unlocked variant of dma_buf_map_attachment(). */ struct sg_table * dma_buf_map_attachment_unlocked(struct dma_buf_attachment *attach, enum dma_data_direction direction) { struct sg_table *sg_table; might_sleep(); if (WARN_ON(!attach || !attach->dmabuf)) return ERR_PTR(-EINVAL); dma_resv_lock(attach->dmabuf->resv, NULL); sg_table = dma_buf_map_attachment(attach, direction); dma_resv_unlock(attach->dmabuf->resv); return sg_table; } EXPORT_SYMBOL_NS_GPL(dma_buf_map_attachment_unlocked, "DMA_BUF"); /** * dma_buf_unmap_attachment - unmaps and decreases usecount of the buffer;might * deallocate the scatterlist associated. Is a wrapper for unmap_dma_buf() of * dma_buf_ops. * @attach: [in] attachment to unmap buffer from * @sg_table: [in] scatterlist info of the buffer to unmap * @direction: [in] direction of DMA transfer * * This unmaps a DMA mapping for @attached obtained by dma_buf_map_attachment(). */ void dma_buf_unmap_attachment(struct dma_buf_attachment *attach, struct sg_table *sg_table, enum dma_data_direction direction) { might_sleep(); if (WARN_ON(!attach || !attach->dmabuf || !sg_table)) return; dma_resv_assert_held(attach->dmabuf->resv); if (attach->sgt == sg_table) return; __unmap_dma_buf(attach, sg_table, direction); if (dma_buf_is_dynamic(attach->dmabuf) && !IS_ENABLED(CONFIG_DMABUF_MOVE_NOTIFY)) dma_buf_unpin(attach); } EXPORT_SYMBOL_NS_GPL(dma_buf_unmap_attachment, "DMA_BUF"); /** * dma_buf_unmap_attachment_unlocked - unmaps and decreases usecount of the buffer;might * deallocate the scatterlist associated. Is a wrapper for unmap_dma_buf() of * dma_buf_ops. * @attach: [in] attachment to unmap buffer from * @sg_table: [in] scatterlist info of the buffer to unmap * @direction: [in] direction of DMA transfer * * Unlocked variant of dma_buf_unmap_attachment(). 
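 *
 * Together with dma_buf_map_attachment_unlocked() this lets an importer that
 * does not manage the reservation lock itself bracket a mapping (illustrative
 * sketch, error handling omitted; device DMA happens between the two calls):
 *
 * .. code-block:: c
 *
 *     struct sg_table *sgt;
 *
 *     sgt = dma_buf_map_attachment_unlocked(attach, DMA_FROM_DEVICE);
 *     if (IS_ERR(sgt))
 *             return PTR_ERR(sgt);
 *
 *     dma_buf_unmap_attachment_unlocked(attach, sgt, DMA_FROM_DEVICE);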
*/ void dma_buf_unmap_attachment_unlocked(struct dma_buf_attachment *attach, struct sg_table *sg_table, enum dma_data_direction direction) { might_sleep(); if (WARN_ON(!attach || !attach->dmabuf || !sg_table)) return; dma_resv_lock(attach->dmabuf->resv, NULL); dma_buf_unmap_attachment(attach, sg_table, direction); dma_resv_unlock(attach->dmabuf->resv); } EXPORT_SYMBOL_NS_GPL(dma_buf_unmap_attachment_unlocked, "DMA_BUF"); /** * dma_buf_move_notify - notify attachments that DMA-buf is moving * * @dmabuf: [in] buffer which is moving * * Informs all attachments that they need to destroy and recreate all their * mappings. */ void dma_buf_move_notify(struct dma_buf *dmabuf) { struct dma_buf_attachment *attach; dma_resv_assert_held(dmabuf->resv); list_for_each_entry(attach, &dmabuf->attachments, node) if (attach->importer_ops) attach->importer_ops->move_notify(attach); } EXPORT_SYMBOL_NS_GPL(dma_buf_move_notify, "DMA_BUF"); /** * DOC: cpu access * * There are multiple reasons for supporting CPU access to a dma buffer object: * * - Fallback operations in the kernel, for example when a device is connected * over USB and the kernel needs to shuffle the data around first before * sending it away. Cache coherency is handled by bracketing any transactions * with calls to dma_buf_begin_cpu_access() and dma_buf_end_cpu_access() * access. * * Since for most kernel internal dma-buf accesses need the entire buffer, a * vmap interface is introduced. Note that on very old 32-bit architectures * vmalloc space might be limited and result in vmap calls failing. * * Interfaces: * * .. code-block:: c * * void *dma_buf_vmap(struct dma_buf *dmabuf, struct iosys_map *map) * void dma_buf_vunmap(struct dma_buf *dmabuf, struct iosys_map *map) * * The vmap call can fail if there is no vmap support in the exporter, or if * it runs out of vmalloc space. Note that the dma-buf layer keeps a reference * count for all vmap access and calls down into the exporter's vmap function * only when no vmapping exists, and only unmaps it once. Protection against * concurrent vmap/vunmap calls is provided by taking the &dma_buf.lock mutex. * * - For full compatibility on the importer side with existing userspace * interfaces, which might already support mmap'ing buffers. This is needed in * many processing pipelines (e.g. feeding a software rendered image into a * hardware pipeline, thumbnail creation, snapshots, ...). Also, Android's ION * framework already supported this and for DMA buffer file descriptors to * replace ION buffers mmap support was needed. * * There is no special interfaces, userspace simply calls mmap on the dma-buf * fd. But like for CPU access there's a need to bracket the actual access, * which is handled by the ioctl (DMA_BUF_IOCTL_SYNC). Note that * DMA_BUF_IOCTL_SYNC can fail with -EAGAIN or -EINTR, in which case it must * be restarted. * * Some systems might need some sort of cache coherency management e.g. when * CPU and GPU domains are being accessed through dma-buf at the same time. * To circumvent this problem there are begin/end coherency markers, that * forward directly to existing dma-buf device drivers vfunc hooks. Userspace * can make use of those markers through the DMA_BUF_IOCTL_SYNC ioctl. The * sequence would be used like following: * * - mmap dma-buf fd * - for each drawing/upload cycle in CPU 1. SYNC_START ioctl, 2. read/write * to mmap area 3. SYNC_END ioctl. 
This can be repeated as often as you * want (with the new data being consumed by say the GPU or the scanout * device) * - munmap once you don't need the buffer any more * * For correctness and optimal performance, it is always required to use * SYNC_START and SYNC_END before and after, respectively, when accessing the * mapped address. Userspace cannot rely on coherent access, even when there * are systems where it just works without calling these ioctls. * * - And as a CPU fallback in userspace processing pipelines. * * Similar to the motivation for kernel cpu access it is again important that * the userspace code of a given importing subsystem can use the same * interfaces with a imported dma-buf buffer object as with a native buffer * object. This is especially important for drm where the userspace part of * contemporary OpenGL, X, and other drivers is huge, and reworking them to * use a different way to mmap a buffer rather invasive. * * The assumption in the current dma-buf interfaces is that redirecting the * initial mmap is all that's needed. A survey of some of the existing * subsystems shows that no driver seems to do any nefarious thing like * syncing up with outstanding asynchronous processing on the device or * allocating special resources at fault time. So hopefully this is good * enough, since adding interfaces to intercept pagefaults and allow pte * shootdowns would increase the complexity quite a bit. * * Interface: * * .. code-block:: c * * int dma_buf_mmap(struct dma_buf *, struct vm_area_struct *, unsigned long); * * If the importing subsystem simply provides a special-purpose mmap call to * set up a mapping in userspace, calling do_mmap with &dma_buf.file will * equally achieve that for a dma-buf object. */ static int __dma_buf_begin_cpu_access(struct dma_buf *dmabuf, enum dma_data_direction direction) { bool write = (direction == DMA_BIDIRECTIONAL || direction == DMA_TO_DEVICE); struct dma_resv *resv = dmabuf->resv; long ret; /* Wait on any implicit rendering fences */ ret = dma_resv_wait_timeout(resv, dma_resv_usage_rw(write), true, MAX_SCHEDULE_TIMEOUT); if (ret < 0) return ret; return 0; } /** * dma_buf_begin_cpu_access - Must be called before accessing a dma_buf from the * cpu in the kernel context. Calls begin_cpu_access to allow exporter-specific * preparations. Coherency is only guaranteed in the specified range for the * specified access direction. * @dmabuf: [in] buffer to prepare cpu access for. * @direction: [in] direction of access. * * After the cpu access is complete the caller should call * dma_buf_end_cpu_access(). Only when cpu access is bracketed by both calls is * it guaranteed to be coherent with other DMA access. * * This function will also wait for any DMA transactions tracked through * implicit synchronization in &dma_buf.resv. For DMA transactions with explicit * synchronization this function will only ensure cache coherency, callers must * ensure synchronization with such DMA transactions on their own. * * Can return negative error values, returns 0 on success. */ int dma_buf_begin_cpu_access(struct dma_buf *dmabuf, enum dma_data_direction direction) { int ret = 0; if (WARN_ON(!dmabuf)) return -EINVAL; might_lock(&dmabuf->resv->lock.base); if (dmabuf->ops->begin_cpu_access) ret = dmabuf->ops->begin_cpu_access(dmabuf, direction); /* Ensure that all fences are waited upon - but we first allow * the native handler the chance to do so more efficiently if it * chooses. A double invocation here will be reasonably cheap no-op. 
*/ if (ret == 0) ret = __dma_buf_begin_cpu_access(dmabuf, direction); return ret; } EXPORT_SYMBOL_NS_GPL(dma_buf_begin_cpu_access, "DMA_BUF"); /** * dma_buf_end_cpu_access - Must be called after accessing a dma_buf from the * cpu in the kernel context. Calls end_cpu_access to allow exporter-specific * actions. Coherency is only guaranteed in the specified range for the * specified access direction. * @dmabuf: [in] buffer to complete cpu access for. * @direction: [in] direction of access. * * This terminates CPU access started with dma_buf_begin_cpu_access(). * * Can return negative error values, returns 0 on success. */ int dma_buf_end_cpu_access(struct dma_buf *dmabuf, enum dma_data_direction direction) { int ret = 0; WARN_ON(!dmabuf); might_lock(&dmabuf->resv->lock.base); if (dmabuf->ops->end_cpu_access) ret = dmabuf->ops->end_cpu_access(dmabuf, direction); return ret; } EXPORT_SYMBOL_NS_GPL(dma_buf_end_cpu_access, "DMA_BUF"); /** * dma_buf_mmap - Setup up a userspace mmap with the given vma * @dmabuf: [in] buffer that should back the vma * @vma: [in] vma for the mmap * @pgoff: [in] offset in pages where this mmap should start within the * dma-buf buffer. * * This function adjusts the passed in vma so that it points at the file of the * dma_buf operation. It also adjusts the starting pgoff and does bounds * checking on the size of the vma. Then it calls the exporters mmap function to * set up the mapping. * * Can return negative error values, returns 0 on success. */ int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma, unsigned long pgoff) { if (WARN_ON(!dmabuf || !vma)) return -EINVAL; /* check if buffer supports mmap */ if (!dmabuf->ops->mmap) return -EINVAL; /* check for offset overflow */ if (pgoff + vma_pages(vma) < pgoff) return -EOVERFLOW; /* check for overflowing the buffer's size */ if (pgoff + vma_pages(vma) > dmabuf->size >> PAGE_SHIFT) return -EINVAL; /* readjust the vma */ vma_set_file(vma, dmabuf->file); vma->vm_pgoff = pgoff; return dmabuf->ops->mmap(dmabuf, vma); } EXPORT_SYMBOL_NS_GPL(dma_buf_mmap, "DMA_BUF"); /** * dma_buf_vmap - Create virtual mapping for the buffer object into kernel * address space. Same restrictions as for vmap and friends apply. * @dmabuf: [in] buffer to vmap * @map: [out] returns the vmap pointer * * This call may fail due to lack of virtual mapping address space. * These calls are optional in drivers. The intended use for them * is for mapping objects linear in kernel space for high use objects. * * To ensure coherency users must call dma_buf_begin_cpu_access() and * dma_buf_end_cpu_access() around any cpu access performed through this * mapping. * * Returns 0 on success, or a negative errno code otherwise. */ int dma_buf_vmap(struct dma_buf *dmabuf, struct iosys_map *map) { struct iosys_map ptr; int ret; iosys_map_clear(map); if (WARN_ON(!dmabuf)) return -EINVAL; dma_resv_assert_held(dmabuf->resv); if (!dmabuf->ops->vmap) return -EINVAL; if (dmabuf->vmapping_counter) { dmabuf->vmapping_counter++; BUG_ON(iosys_map_is_null(&dmabuf->vmap_ptr)); *map = dmabuf->vmap_ptr; return 0; } BUG_ON(iosys_map_is_set(&dmabuf->vmap_ptr)); ret = dmabuf->ops->vmap(dmabuf, &ptr); if (WARN_ON_ONCE(ret)) return ret; dmabuf->vmap_ptr = ptr; dmabuf->vmapping_counter = 1; *map = dmabuf->vmap_ptr; return 0; } EXPORT_SYMBOL_NS_GPL(dma_buf_vmap, "DMA_BUF"); /** * dma_buf_vmap_unlocked - Create virtual mapping for the buffer object into kernel * address space. Same restrictions as for vmap and friends apply. 
* @dmabuf: [in] buffer to vmap * @map: [out] returns the vmap pointer * * Unlocked version of dma_buf_vmap() * * Returns 0 on success, or a negative errno code otherwise. */ int dma_buf_vmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map) { int ret; iosys_map_clear(map); if (WARN_ON(!dmabuf)) return -EINVAL; dma_resv_lock(dmabuf->resv, NULL); ret = dma_buf_vmap(dmabuf, map); dma_resv_unlock(dmabuf->resv); return ret; } EXPORT_SYMBOL_NS_GPL(dma_buf_vmap_unlocked, "DMA_BUF"); /** * dma_buf_vunmap - Unmap a vmap obtained by dma_buf_vmap. * @dmabuf: [in] buffer to vunmap * @map: [in] vmap pointer to vunmap */ void dma_buf_vunmap(struct dma_buf *dmabuf, struct iosys_map *map) { if (WARN_ON(!dmabuf)) return; dma_resv_assert_held(dmabuf->resv); BUG_ON(iosys_map_is_null(&dmabuf->vmap_ptr)); BUG_ON(dmabuf->vmapping_counter == 0); BUG_ON(!iosys_map_is_equal(&dmabuf->vmap_ptr, map)); if (--dmabuf->vmapping_counter == 0) { if (dmabuf->ops->vunmap) dmabuf->ops->vunmap(dmabuf, map); iosys_map_clear(&dmabuf->vmap_ptr); } } EXPORT_SYMBOL_NS_GPL(dma_buf_vunmap, "DMA_BUF"); /** * dma_buf_vunmap_unlocked - Unmap a vmap obtained by dma_buf_vmap. * @dmabuf: [in] buffer to vunmap * @map: [in] vmap pointer to vunmap */ void dma_buf_vunmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map) { if (WARN_ON(!dmabuf)) return; dma_resv_lock(dmabuf->resv, NULL); dma_buf_vunmap(dmabuf, map); dma_resv_unlock(dmabuf->resv); } EXPORT_SYMBOL_NS_GPL(dma_buf_vunmap_unlocked, "DMA_BUF"); #ifdef CONFIG_DEBUG_FS static int dma_buf_debug_show(struct seq_file *s, void *unused) { struct dma_buf *buf_obj; struct dma_buf_attachment *attach_obj; int count = 0, attach_count; size_t size = 0; int ret; ret = mutex_lock_interruptible(&debugfs_list_mutex); if (ret) return ret; seq_puts(s, "\nDma-buf Objects:\n"); seq_printf(s, "%-8s\t%-8s\t%-8s\t%-8s\texp_name\t%-8s\tname\n", "size", "flags", "mode", "count", "ino"); list_for_each_entry(buf_obj, &debugfs_list, list_node) { ret = dma_resv_lock_interruptible(buf_obj->resv, NULL); if (ret) goto error_unlock; spin_lock(&buf_obj->name_lock); seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\t%s\n", buf_obj->size, buf_obj->file->f_flags, buf_obj->file->f_mode, file_count(buf_obj->file), buf_obj->exp_name, file_inode(buf_obj->file)->i_ino, buf_obj->name ?: "<none>"); spin_unlock(&buf_obj->name_lock); dma_resv_describe(buf_obj->resv, s); seq_puts(s, "\tAttached Devices:\n"); attach_count = 0; list_for_each_entry(attach_obj, &buf_obj->attachments, node) { seq_printf(s, "\t%s\n", dev_name(attach_obj->dev)); attach_count++; } dma_resv_unlock(buf_obj->resv); seq_printf(s, "Total %d devices attached\n\n", attach_count); count++; size += buf_obj->size; } seq_printf(s, "\nTotal %d objects, %zu bytes\n", count, size); mutex_unlock(&debugfs_list_mutex); return 0; error_unlock: mutex_unlock(&debugfs_list_mutex); return ret; } DEFINE_SHOW_ATTRIBUTE(dma_buf_debug); static struct dentry *dma_buf_debugfs_dir; static int dma_buf_init_debugfs(void) { struct dentry *d; int err = 0; d = debugfs_create_dir("dma_buf", NULL); if (IS_ERR(d)) return PTR_ERR(d); dma_buf_debugfs_dir = d; d = debugfs_create_file("bufinfo", 0444, dma_buf_debugfs_dir, NULL, &dma_buf_debug_fops); if (IS_ERR(d)) { pr_debug("dma_buf: debugfs: failed to create node bufinfo\n"); debugfs_remove_recursive(dma_buf_debugfs_dir); dma_buf_debugfs_dir = NULL; err = PTR_ERR(d); } return err; } static void dma_buf_uninit_debugfs(void) { debugfs_remove_recursive(dma_buf_debugfs_dir); } #else static inline int dma_buf_init_debugfs(void) { 
return 0; } static inline void dma_buf_uninit_debugfs(void) { } #endif static int __init dma_buf_init(void) { int ret; ret = dma_buf_init_sysfs_statistics(); if (ret) return ret; dma_buf_mnt = kern_mount(&dma_buf_fs_type); if (IS_ERR(dma_buf_mnt)) return PTR_ERR(dma_buf_mnt); dma_buf_init_debugfs(); return 0; } subsys_initcall(dma_buf_init); static void __exit dma_buf_deinit(void) { dma_buf_uninit_debugfs(); kern_unmount(dma_buf_mnt); dma_buf_uninit_sysfs_statistics(); } __exitcall(dma_buf_deinit);
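/*
 * Illustrative sketch, not part of this file: one way an importer could
 * bracket kernel CPU access around a vmap of an imported buffer, following
 * the "cpu access" DOC section and the dma_buf_vmap()/dma_buf_begin_cpu_access()
 * kernel-doc above. The helper name my_importer_read() is hypothetical, and
 * the sketch assumes the exporter vmaps to system memory rather than I/O
 * memory.
 *
 * .. code-block:: c
 *
 *	#include <linux/dma-buf.h>
 *	#include <linux/dma-direction.h>
 *	#include <linux/iosys-map.h>
 *	#include <linux/string.h>
 *
 *	static int my_importer_read(struct dma_buf *dmabuf, void *dst, size_t len)
 *	{
 *		struct iosys_map map;
 *		int ret;
 *
 *		if (len > dmabuf->size)
 *			return -EINVAL;
 *
 *		ret = dma_buf_vmap_unlocked(dmabuf, &map);
 *		if (ret)
 *			return ret;
 *
 *		// bracket the CPU access so caches are coherent with DMA
 *		ret = dma_buf_begin_cpu_access(dmabuf, DMA_FROM_DEVICE);
 *		if (ret)
 *			goto out_vunmap;
 *
 *		memcpy(dst, map.vaddr, len);	// assumes !iosys_map_is_iomem(&map)
 *
 *		ret = dma_buf_end_cpu_access(dmabuf, DMA_FROM_DEVICE);
 *	out_vunmap:
 *		dma_buf_vunmap_unlocked(dmabuf, &map);
 *		return ret;
 *	}
 */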
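/*
 * Illustrative userspace sketch, not part of this file: the mmap plus
 * DMA_BUF_IOCTL_SYNC bracketing described in the "cpu access" DOC section
 * above, including the restart on -EAGAIN/-EINTR. fill_buffer() and the
 * origin of dmabuf_fd are assumptions made for the example; error handling
 * is abbreviated.
 *
 * .. code-block:: c
 *
 *	#include <errno.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/mman.h>
 *	#include <linux/dma-buf.h>
 *
 *	static int fill_buffer(int dmabuf_fd, const void *src, size_t size)
 *	{
 *		struct dma_buf_sync sync = { .flags = 0 };
 *		void *map;
 *		int ret;
 *
 *		map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *			   dmabuf_fd, 0);
 *		if (map == MAP_FAILED)
 *			return -errno;
 *
 *		sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE;
 *		do {
 *			ret = ioctl(dmabuf_fd, DMA_BUF_IOCTL_SYNC, &sync);
 *		} while (ret == -1 && (errno == EAGAIN || errno == EINTR));
 *
 *		memcpy(map, src, size);		// CPU access between START and END
 *
 *		sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE;
 *		do {
 *			ret = ioctl(dmabuf_fd, DMA_BUF_IOCTL_SYNC, &sync);
 *		} while (ret == -1 && (errno == EAGAIN || errno == EINTR));
 *
 *		munmap(map, size);
 *		return 0;
 *	}
 */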
913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/ife.c Inter-FE action based on ForCES WG InterFE LFB * * Refer to: * draft-ietf-forces-interfelfb-03 * and * netdev01 paper: * "Distributing Linux Traffic Control Classifier-Action * Subsystem" * Authors: Jamal Hadi Salim and Damascene M. Joachimpillai * * copyright Jamal Hadi Salim (2015) */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <uapi/linux/tc_act/tc_ife.h> #include <net/tc_act/tc_ife.h> #include <linux/etherdevice.h> #include <net/ife.h> #include <net/tc_wrapper.h> static int max_metacnt = IFE_META_MAX + 1; static struct tc_action_ops act_ife_ops; static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = { [TCA_IFE_PARMS] = { .len = sizeof(struct tc_ife)}, [TCA_IFE_DMAC] = { .len = ETH_ALEN}, [TCA_IFE_SMAC] = { .len = ETH_ALEN}, [TCA_IFE_TYPE] = { .type = NLA_U16}, }; int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi) { u16 edata = 0; if (mi->metaval) edata = *(u16 *)mi->metaval; else if (metaval) edata = metaval; if (!edata) /* will not encode */ return 0; edata = htons(edata); return ife_tlv_meta_encode(skbdata, mi->metaid, 2, &edata); } EXPORT_SYMBOL_GPL(ife_encode_meta_u16); int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi) { if (mi->metaval) return nla_put_u32(skb, mi->metaid, *(u32 *)mi->metaval); else return nla_put(skb, mi->metaid, 0, NULL); } EXPORT_SYMBOL_GPL(ife_get_meta_u32); int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi) { if (metaval || mi->metaval) return 8; /* T+L+V == 2+2+4 */ return 0; } EXPORT_SYMBOL_GPL(ife_check_meta_u32); int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi) { if (metaval || mi->metaval) return 8; /* T+L+(V) == 2+2+(2+2bytepad) */ return 0; } EXPORT_SYMBOL_GPL(ife_check_meta_u16); int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi) { u32 edata = metaval; if (mi->metaval) edata = *(u32 *)mi->metaval; else if (metaval) edata = metaval; if (!edata) /* will not encode */ return 0; edata = htonl(edata); return ife_tlv_meta_encode(skbdata, mi->metaid, 4, &edata); } EXPORT_SYMBOL_GPL(ife_encode_meta_u32); int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi) { if (mi->metaval) return nla_put_u16(skb, mi->metaid, *(u16 *)mi->metaval); else return nla_put(skb, mi->metaid, 0, NULL); } EXPORT_SYMBOL_GPL(ife_get_meta_u16); int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval, gfp_t gfp) { mi->metaval = kmemdup(metaval, sizeof(u32), gfp); if (!mi->metaval) return -ENOMEM; return 0; } EXPORT_SYMBOL_GPL(ife_alloc_meta_u32); int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval, gfp_t gfp) { mi->metaval = kmemdup(metaval, sizeof(u16), gfp); if (!mi->metaval) return -ENOMEM; return 0; } EXPORT_SYMBOL_GPL(ife_alloc_meta_u16); void ife_release_meta_gen(struct tcf_meta_info *mi) { kfree(mi->metaval); } EXPORT_SYMBOL_GPL(ife_release_meta_gen); int ife_validate_meta_u32(void *val, int len) { if (len == sizeof(u32)) return 0; return -EINVAL; } EXPORT_SYMBOL_GPL(ife_validate_meta_u32); int ife_validate_meta_u16(void *val, int len) { /* length will not include padding */ if (len == sizeof(u16)) return 0; return -EINVAL; } 
EXPORT_SYMBOL_GPL(ife_validate_meta_u16); static LIST_HEAD(ifeoplist); static DEFINE_RWLOCK(ife_mod_lock); static struct tcf_meta_ops *find_ife_oplist(u16 metaid) { struct tcf_meta_ops *o; read_lock(&ife_mod_lock); list_for_each_entry(o, &ifeoplist, list) { if (o->metaid == metaid) { if (!try_module_get(o->owner)) o = NULL; read_unlock(&ife_mod_lock); return o; } } read_unlock(&ife_mod_lock); return NULL; } int register_ife_op(struct tcf_meta_ops *mops) { struct tcf_meta_ops *m; if (!mops->metaid || !mops->metatype || !mops->name || !mops->check_presence || !mops->encode || !mops->decode || !mops->get || !mops->alloc) return -EINVAL; write_lock(&ife_mod_lock); list_for_each_entry(m, &ifeoplist, list) { if (m->metaid == mops->metaid || (strcmp(mops->name, m->name) == 0)) { write_unlock(&ife_mod_lock); return -EEXIST; } } if (!mops->release) mops->release = ife_release_meta_gen; list_add_tail(&mops->list, &ifeoplist); write_unlock(&ife_mod_lock); return 0; } EXPORT_SYMBOL_GPL(register_ife_op); int unregister_ife_op(struct tcf_meta_ops *mops) { struct tcf_meta_ops *m; int err = -ENOENT; write_lock(&ife_mod_lock); list_for_each_entry(m, &ifeoplist, list) { if (m->metaid == mops->metaid) { list_del(&mops->list); err = 0; break; } } write_unlock(&ife_mod_lock); return err; } EXPORT_SYMBOL_GPL(unregister_ife_op); static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len) { int ret = 0; /* XXX: unfortunately cant use nla_policy at this point * because a length of 0 is valid in the case of * "allow". "use" semantics do enforce for proper * length and i couldve use nla_policy but it makes it hard * to use it just for that.. */ if (ops->validate) return ops->validate(val, len); if (ops->metatype == NLA_U32) ret = ife_validate_meta_u32(val, len); else if (ops->metatype == NLA_U16) ret = ife_validate_meta_u16(val, len); return ret; } #ifdef CONFIG_MODULES static const char *ife_meta_id2name(u32 metaid) { switch (metaid) { case IFE_META_SKBMARK: return "skbmark"; case IFE_META_PRIO: return "skbprio"; case IFE_META_TCINDEX: return "tcindex"; default: return "unknown"; } } #endif /* called when adding new meta information */ static int load_metaops_and_vet(u32 metaid, void *val, int len, bool rtnl_held) { struct tcf_meta_ops *ops = find_ife_oplist(metaid); int ret = 0; if (!ops) { ret = -ENOENT; #ifdef CONFIG_MODULES if (rtnl_held) rtnl_unlock(); request_module("ife-meta-%s", ife_meta_id2name(metaid)); if (rtnl_held) rtnl_lock(); ops = find_ife_oplist(metaid); #endif } if (ops) { ret = 0; if (len) ret = ife_validate_metatype(ops, val, len); module_put(ops->owner); } return ret; } /* called when adding new meta information */ static int __add_metainfo(const struct tcf_meta_ops *ops, struct tcf_ife_info *ife, u32 metaid, void *metaval, int len, bool atomic, bool exists) { struct tcf_meta_info *mi = NULL; int ret = 0; mi = kzalloc(sizeof(*mi), atomic ? GFP_ATOMIC : GFP_KERNEL); if (!mi) return -ENOMEM; mi->metaid = metaid; mi->ops = ops; if (len > 0) { ret = ops->alloc(mi, metaval, atomic ?
GFP_ATOMIC : GFP_KERNEL); if (ret != 0) { kfree(mi); return ret; } } if (exists) spin_lock_bh(&ife->tcf_lock); list_add_tail(&mi->metalist, &ife->metalist); if (exists) spin_unlock_bh(&ife->tcf_lock); return ret; } static int add_metainfo_and_get_ops(const struct tcf_meta_ops *ops, struct tcf_ife_info *ife, u32 metaid, bool exists) { int ret; if (!try_module_get(ops->owner)) return -ENOENT; ret = __add_metainfo(ops, ife, metaid, NULL, 0, true, exists); if (ret) module_put(ops->owner); return ret; } static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval, int len, bool exists) { const struct tcf_meta_ops *ops = find_ife_oplist(metaid); int ret; if (!ops) return -ENOENT; ret = __add_metainfo(ops, ife, metaid, metaval, len, false, exists); if (ret) /*put back what find_ife_oplist took */ module_put(ops->owner); return ret; } static int use_all_metadata(struct tcf_ife_info *ife, bool exists) { struct tcf_meta_ops *o; int rc = 0; int installed = 0; read_lock(&ife_mod_lock); list_for_each_entry(o, &ifeoplist, list) { rc = add_metainfo_and_get_ops(o, ife, o->metaid, exists); if (rc == 0) installed += 1; } read_unlock(&ife_mod_lock); if (installed) return 0; else return -EINVAL; } static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife) { struct tcf_meta_info *e; struct nlattr *nest; unsigned char *b = skb_tail_pointer(skb); int total_encoded = 0; /*can only happen on decode */ if (list_empty(&ife->metalist)) return 0; nest = nla_nest_start_noflag(skb, TCA_IFE_METALST); if (!nest) goto out_nlmsg_trim; list_for_each_entry(e, &ife->metalist, metalist) { if (!e->ops->get(skb, e)) total_encoded += 1; } if (!total_encoded) goto out_nlmsg_trim; nla_nest_end(skb, nest); return 0; out_nlmsg_trim: nlmsg_trim(skb, b); return -1; } /* under ife->tcf_lock */ static void _tcf_ife_cleanup(struct tc_action *a) { struct tcf_ife_info *ife = to_ife(a); struct tcf_meta_info *e, *n; list_for_each_entry_safe(e, n, &ife->metalist, metalist) { list_del(&e->metalist); if (e->metaval) { if (e->ops->release) e->ops->release(e); else kfree(e->metaval); } module_put(e->ops->owner); kfree(e); } } static void tcf_ife_cleanup(struct tc_action *a) { struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; spin_lock_bh(&ife->tcf_lock); _tcf_ife_cleanup(a); spin_unlock_bh(&ife->tcf_lock); p = rcu_dereference_protected(ife->params, 1); if (p) kfree_rcu(p, rcu); } static int load_metalist(struct nlattr **tb, bool rtnl_held) { int i; for (i = 1; i < max_metacnt; i++) { if (tb[i]) { void *val = nla_data(tb[i]); int len = nla_len(tb[i]); int rc; rc = load_metaops_and_vet(i, val, len, rtnl_held); if (rc != 0) return rc; } } return 0; } static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, bool exists, bool rtnl_held) { int len = 0; int rc = 0; int i = 0; void *val; for (i = 1; i < max_metacnt; i++) { if (tb[i]) { val = nla_data(tb[i]); len = nla_len(tb[i]); rc = add_metainfo(ife, i, val, len, exists); if (rc) return rc; } } return rc; } static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_ife_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_IFE_MAX + 1]; struct nlattr *tb2[IFE_META_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tcf_ife_params *p; struct tcf_ife_info *ife; u16 ife_type = ETH_P_IFE; struct tc_ife *parm; u8 *daddr = NULL; u8 *saddr = NULL; bool exists = false; int ret = 
0; u32 index; int err; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "IFE requires attributes to be passed"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_IFE_MAX, nla, ife_policy, NULL); if (err < 0) return err; if (!tb[TCA_IFE_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_IFE_PARMS]); /* IFE_DECODE is 0 and indicates the opposite of IFE_ENCODE because * they cannot run as the same time. Check on all other values which * are not supported right now. */ if (parm->flags & ~IFE_ENCODE) return -EINVAL; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) return -ENOMEM; if (tb[TCA_IFE_METALST]) { err = nla_parse_nested_deprecated(tb2, IFE_META_MAX, tb[TCA_IFE_METALST], NULL, NULL); if (err) { kfree(p); return err; } err = load_metalist(tb2, !(flags & TCA_ACT_FLAGS_NO_RTNL)); if (err) { kfree(p); return err; } } index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) { kfree(p); return err; } exists = err; if (exists && bind) { kfree(p); return ACT_P_BOUND; } if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_ife_ops, bind, true, flags); if (ret) { tcf_idr_cleanup(tn, index); kfree(p); return ret; } ret = ACT_P_CREATED; } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); kfree(p); return -EEXIST; } ife = to_ife(*a); if (ret == ACT_P_CREATED) INIT_LIST_HEAD(&ife->metalist); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; p->flags = parm->flags; if (parm->flags & IFE_ENCODE) { if (tb[TCA_IFE_TYPE]) ife_type = nla_get_u16(tb[TCA_IFE_TYPE]); if (tb[TCA_IFE_DMAC]) daddr = nla_data(tb[TCA_IFE_DMAC]); if (tb[TCA_IFE_SMAC]) saddr = nla_data(tb[TCA_IFE_SMAC]); } if (parm->flags & IFE_ENCODE) { if (daddr) ether_addr_copy(p->eth_dst, daddr); else eth_zero_addr(p->eth_dst); if (saddr) ether_addr_copy(p->eth_src, saddr); else eth_zero_addr(p->eth_src); p->eth_type = ife_type; } if (tb[TCA_IFE_METALST]) { err = populate_metalist(ife, tb2, exists, !(flags & TCA_ACT_FLAGS_NO_RTNL)); if (err) goto metadata_parse_err; } else { /* if no passed metadata allow list or passed allow-all * then here we process by adding as many supported metadatum * as we can. 
You better have at least one else we are * going to bail out */ err = use_all_metadata(ife, exists); if (err) goto metadata_parse_err; } if (exists) spin_lock_bh(&ife->tcf_lock); /* protected by tcf_lock when modifying existing action */ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p = rcu_replace_pointer(ife->params, p, 1); if (exists) spin_unlock_bh(&ife->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (p) kfree_rcu(p, rcu); return ret; metadata_parse_err: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: kfree(p); tcf_idr_release(*a, bind); return err; } static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; struct tc_ife opt = { .index = ife->tcf_index, .refcnt = refcount_read(&ife->tcf_refcnt) - ref, .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, }; struct tcf_t t; spin_lock_bh(&ife->tcf_lock); opt.action = ife->tcf_action; p = rcu_dereference_protected(ife->params, lockdep_is_held(&ife->tcf_lock)); opt.flags = p->flags; if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; tcf_tm_dump(&t, &ife->tcf_tm); if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD)) goto nla_put_failure; if (!is_zero_ether_addr(p->eth_dst)) { if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, p->eth_dst)) goto nla_put_failure; } if (!is_zero_ether_addr(p->eth_src)) { if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, p->eth_src)) goto nla_put_failure; } if (nla_put(skb, TCA_IFE_TYPE, 2, &p->eth_type)) goto nla_put_failure; if (dump_metalist(skb, ife)) { /*ignore failure to dump metalist */ pr_info("Failed to dump metalist\n"); } spin_unlock_bh(&ife->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&ife->tcf_lock); nlmsg_trim(skb, b); return -1; } static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife, u16 metaid, u16 mlen, void *mdata) { struct tcf_meta_info *e; /* XXX: use hash to speed up */ list_for_each_entry(e, &ife->metalist, metalist) { if (metaid == e->metaid) { if (e->ops) { /* We check for decode presence already */ return e->ops->decode(skb, mdata, mlen); } } } return -ENOENT; } static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; u8 *ifehdr_end; u8 *tlv_data; u16 metalen; bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (skb_at_tc_ingress(skb)) skb_push(skb, skb->dev->hard_header_len); tlv_data = ife_decode(skb, &metalen); if (unlikely(!tlv_data)) { qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } ifehdr_end = tlv_data + metalen; for (; tlv_data < ifehdr_end; tlv_data = ife_tlv_meta_next(tlv_data)) { u8 *curr_data; u16 mtype; u16 dlen; curr_data = ife_tlv_meta_decode(tlv_data, ifehdr_end, &mtype, &dlen, NULL); if (!curr_data) { qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) { /* abuse overlimits to count when we receive metadata * but dont have an ops for it */ pr_info_ratelimited("Unknown metaid %d dlen %d\n", mtype, dlen); qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats)); } } if (WARN_ON(tlv_data != ifehdr_end)) { qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } skb->protocol = eth_type_trans(skb, skb->dev); skb_reset_network_header(skb); return action; } /*XXX: check if we 
can do this at install time instead of current * send data path **/ static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife) { struct tcf_meta_info *e, *n; int tot_run_sz = 0, run_sz = 0; list_for_each_entry_safe(e, n, &ife->metalist, metalist) { if (e->ops->check_presence) { run_sz = e->ops->check_presence(skb, e); tot_run_sz += run_sz; } } return tot_run_sz; } static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res, struct tcf_ife_params *p) { struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; struct ethhdr *oethh; /* outer ether header */ struct tcf_meta_info *e; /* OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA where ORIGDATA = original ethernet header ... */ u16 metalen = ife_get_sz(skb, ife); int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN; unsigned int skboff = 0; int new_len = skb->len + hdrm; bool exceed_mtu = false; void *ife_meta; int err = 0; if (!skb_at_tc_ingress(skb)) { if (new_len > skb->dev->mtu) exceed_mtu = true; } bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (!metalen) { /* no metadata to send */ /* abuse overlimits to count when we allow packet * with no metadata */ qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats)); return action; } /* could be stupid policy setup or mtu config * so lets be conservative.. */ if ((action == TC_ACT_SHOT) || exceed_mtu) { qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } if (skb_at_tc_ingress(skb)) skb_push(skb, skb->dev->hard_header_len); ife_meta = ife_encode(skb, metalen); spin_lock(&ife->tcf_lock); /* XXX: we dont have a clever way of telling encode to * not repeat some of the computations that are done by * ops->presence_check... 
*/ list_for_each_entry(e, &ife->metalist, metalist) { if (e->ops->encode) { err = e->ops->encode(skb, (void *)(ife_meta + skboff), e); } if (err < 0) { /* too corrupt to keep around if overwritten */ spin_unlock(&ife->tcf_lock); qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } skboff += err; } spin_unlock(&ife->tcf_lock); oethh = (struct ethhdr *)skb->data; if (!is_zero_ether_addr(p->eth_src)) ether_addr_copy(oethh->h_source, p->eth_src); if (!is_zero_ether_addr(p->eth_dst)) ether_addr_copy(oethh->h_dest, p->eth_dst); oethh->h_proto = htons(p->eth_type); if (skb_at_tc_ingress(skb)) skb_pull(skb, skb->dev->hard_header_len); return action; } TC_INDIRECT_SCOPE int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; int ret; p = rcu_dereference_bh(ife->params); if (p->flags & IFE_ENCODE) { ret = tcf_ife_encode(skb, a, res, p); return ret; } return tcf_ife_decode(skb, a, res); } static struct tc_action_ops act_ife_ops = { .kind = "ife", .id = TCA_ID_IFE, .owner = THIS_MODULE, .act = tcf_ife_act, .dump = tcf_ife_dump, .cleanup = tcf_ife_cleanup, .init = tcf_ife_init, .size = sizeof(struct tcf_ife_info), }; MODULE_ALIAS_NET_ACT("ife"); static __net_init int ife_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_ife_ops.net_id); return tc_action_net_init(net, tn, &act_ife_ops); } static void __net_exit ife_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_ife_ops.net_id); } static struct pernet_operations ife_net_ops = { .init = ife_init_net, .exit_batch = ife_exit_net, .id = &act_ife_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init ife_init_module(void) { return tcf_register_action(&act_ife_ops, &ife_net_ops); } static void __exit ife_cleanup_module(void) { tcf_unregister_action(&act_ife_ops, &ife_net_ops); } module_init(ife_init_module); module_exit(ife_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2015)"); MODULE_DESCRIPTION("Inter-FE LFB action"); MODULE_LICENSE("GPL"); |
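/*
 * Illustrative sketch, not part of this file: a minimal metadata module built
 * on the register_ife_op()/unregister_ife_op() interface above, wiring the
 * exported generic u32 helpers (ife_get_meta_u32(), ife_alloc_meta_u32(),
 * ife_validate_meta_u32(), ife_release_meta_gen()) into a tcf_meta_ops. It
 * follows the pattern of the in-tree act_meta_* modules for the skb mark
 * (IFE_META_SKBMARK); treat the function and variable names below as
 * assumptions made for the example.
 *
 * .. code-block:: c
 *
 *	#include <linux/module.h>
 *	#include <linux/skbuff.h>
 *	#include <net/netlink.h>
 *	#include <uapi/linux/tc_act/tc_ife.h>
 *	#include <net/tc_act/tc_ife.h>
 *
 *	static int skbmark_check(struct sk_buff *skb, struct tcf_meta_info *e)
 *	{
 *		return ife_check_meta_u32(skb->mark, e);
 *	}
 *
 *	static int skbmark_encode(struct sk_buff *skb, void *skbdata,
 *				  struct tcf_meta_info *e)
 *	{
 *		return ife_encode_meta_u32(skb->mark, skbdata, e);
 *	}
 *
 *	static int skbmark_decode(struct sk_buff *skb, void *data, u16 len)
 *	{
 *		// value arrives in network byte order from the IFE TLV
 *		skb->mark = ntohl(*(__be32 *)data);
 *		return 0;
 *	}
 *
 *	static struct tcf_meta_ops ife_skbmark_ops = {
 *		.metaid		= IFE_META_SKBMARK,
 *		.metatype	= NLA_U32,
 *		.name		= "skbmark",
 *		.check_presence	= skbmark_check,
 *		.encode		= skbmark_encode,
 *		.decode		= skbmark_decode,
 *		.get		= ife_get_meta_u32,
 *		.alloc		= ife_alloc_meta_u32,
 *		.release	= ife_release_meta_gen,
 *		.validate	= ife_validate_meta_u32,
 *		.owner		= THIS_MODULE,
 *	};
 *
 *	static int __init ifeskbmark_init(void)
 *	{
 *		return register_ife_op(&ife_skbmark_ops);
 *	}
 *
 *	static void __exit ifeskbmark_exit(void)
 *	{
 *		unregister_ife_op(&ife_skbmark_ops);
 *	}
 *
 *	module_init(ifeskbmark_init);
 *	module_exit(ifeskbmark_exit);
 *	MODULE_LICENSE("GPL");
 */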
2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 | // SPDX-License-Identifier: GPL-2.0-or-later /* * GRE over IPv6 protocol decoder. * * Authors: Dmitry Kozlov (xeb@mail.ru) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/in6.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <linux/hash.h> #include <linux/if_tunnel.h> #include <linux/ip6_tunnel.h> #include <net/sock.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/addrconf.h> #include <net/arp.h> #include <net/checksum.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/ip6_tunnel.h> #include <net/gre.h> #include <net/erspan.h> #include <net/dst_metadata.h> static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); #define IP6_GRE_HASH_SIZE_SHIFT 5 #define IP6_GRE_HASH_SIZE (1 << IP6_GRE_HASH_SIZE_SHIFT) static unsigned int ip6gre_net_id __read_mostly; struct ip6gre_net { struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE]; struct ip6_tnl __rcu *collect_md_tun; struct ip6_tnl __rcu *collect_md_tun_erspan; struct net_device *fb_tunnel_dev; }; static struct rtnl_link_ops ip6gre_link_ops __read_mostly; static struct rtnl_link_ops ip6gre_tap_ops __read_mostly; static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly; static int ip6gre_tunnel_init(struct net_device *dev); static void ip6gre_tunnel_setup(struct net_device *dev); static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t); static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu); static void ip6erspan_tnl_link_config(struct ip6_tnl *t, int set_mtu); /* Tunnel hash table */ /* 4 hash tables: 3: (remote,local) 2: (remote,*) 1: (*,local) 0: (*,*) We require exact key match i.e. if a key is present in packet it will match only tunnel with the same key; if it is not present, it will match only keyless tunnel. 
All keysless packets, if not matched configured keyless tunnels will match fallback tunnel. */ #define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(IP6_GRE_HASH_SIZE - 1)) static u32 HASH_ADDR(const struct in6_addr *addr) { u32 hash = ipv6_addr_hash(addr); return hash_32(hash, IP6_GRE_HASH_SIZE_SHIFT); } #define tunnels_r_l tunnels[3] #define tunnels_r tunnels[2] #define tunnels_l tunnels[1] #define tunnels_wc tunnels[0] /* Given src, dst and key, find appropriate for input tunnel. */ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, const struct in6_addr *remote, const struct in6_addr *local, __be32 key, __be16 gre_proto) { struct net *net = dev_net(dev); int link = dev->ifindex; unsigned int h0 = HASH_ADDR(remote); unsigned int h1 = HASH_KEY(key); struct ip6_tnl *t, *cand = NULL; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); int dev_type = (gre_proto == htons(ETH_P_TEB) || gre_proto == htons(ETH_P_ERSPAN) || gre_proto == htons(ETH_P_ERSPAN2)) ? ARPHRD_ETHER : ARPHRD_IP6GRE; int score, cand_score = 4; struct net_device *ndev; for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_equal(remote, &t->parms.raddr) || key != t->parms.i_key || !(t->dev->flags & IFF_UP)) continue; if (t->dev->type != ARPHRD_IP6GRE && t->dev->type != dev_type) continue; score = 0; if (t->parms.link != link) score |= 1; if (t->dev->type != dev_type) score |= 2; if (score == 0) return t; if (score < cand_score) { cand = t; cand_score = score; } } for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) { if (!ipv6_addr_equal(remote, &t->parms.raddr) || key != t->parms.i_key || !(t->dev->flags & IFF_UP)) continue; if (t->dev->type != ARPHRD_IP6GRE && t->dev->type != dev_type) continue; score = 0; if (t->parms.link != link) score |= 1; if (t->dev->type != dev_type) score |= 2; if (score == 0) return t; if (score < cand_score) { cand = t; cand_score = score; } } for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) { if ((!ipv6_addr_equal(local, &t->parms.laddr) && (!ipv6_addr_equal(local, &t->parms.raddr) || !ipv6_addr_is_multicast(local))) || key != t->parms.i_key || !(t->dev->flags & IFF_UP)) continue; if (t->dev->type != ARPHRD_IP6GRE && t->dev->type != dev_type) continue; score = 0; if (t->parms.link != link) score |= 1; if (t->dev->type != dev_type) score |= 2; if (score == 0) return t; if (score < cand_score) { cand = t; cand_score = score; } } for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) { if (t->parms.i_key != key || !(t->dev->flags & IFF_UP)) continue; if (t->dev->type != ARPHRD_IP6GRE && t->dev->type != dev_type) continue; score = 0; if (t->parms.link != link) score |= 1; if (t->dev->type != dev_type) score |= 2; if (score == 0) return t; if (score < cand_score) { cand = t; cand_score = score; } } if (cand) return cand; if (gre_proto == htons(ETH_P_ERSPAN) || gre_proto == htons(ETH_P_ERSPAN2)) t = rcu_dereference(ign->collect_md_tun_erspan); else t = rcu_dereference(ign->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; ndev = READ_ONCE(ign->fb_tunnel_dev); if (ndev && ndev->flags & IFF_UP) return netdev_priv(ndev); return NULL; } static struct ip6_tnl __rcu **__ip6gre_bucket(struct ip6gre_net *ign, const struct __ip6_tnl_parm *p) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; unsigned int h = HASH_KEY(p->i_key); int prio = 0; if (!ipv6_addr_any(local)) prio |= 1; if (!ipv6_addr_any(remote) && !ipv6_addr_is_multicast(remote)) { prio |= 2; h ^= HASH_ADDR(remote); } return 
&ign->tunnels[prio][h]; } static void ip6gre_tunnel_link_md(struct ip6gre_net *ign, struct ip6_tnl *t) { if (t->parms.collect_md) rcu_assign_pointer(ign->collect_md_tun, t); } static void ip6erspan_tunnel_link_md(struct ip6gre_net *ign, struct ip6_tnl *t) { if (t->parms.collect_md) rcu_assign_pointer(ign->collect_md_tun_erspan, t); } static void ip6gre_tunnel_unlink_md(struct ip6gre_net *ign, struct ip6_tnl *t) { if (t->parms.collect_md) rcu_assign_pointer(ign->collect_md_tun, NULL); } static void ip6erspan_tunnel_unlink_md(struct ip6gre_net *ign, struct ip6_tnl *t) { if (t->parms.collect_md) rcu_assign_pointer(ign->collect_md_tun_erspan, NULL); } static inline struct ip6_tnl __rcu **ip6gre_bucket(struct ip6gre_net *ign, const struct ip6_tnl *t) { return __ip6gre_bucket(ign, &t->parms); } static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t); rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; for (tp = ip6gre_bucket(ign, t); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static struct ip6_tnl *ip6gre_tunnel_find(struct net *net, const struct __ip6_tnl_parm *parms, int type) { const struct in6_addr *remote = &parms->raddr; const struct in6_addr *local = &parms->laddr; __be32 key = parms->i_key; int link = parms->link; struct ip6_tnl *t; struct ip6_tnl __rcu **tp; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); for (tp = __ip6gre_bucket(ign, parms); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr) && key == t->parms.i_key && link == t->parms.link && type == t->dev->type) break; return t; } static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, const struct __ip6_tnl_parm *parms, int create) { struct ip6_tnl *t, *nt; struct net_device *dev; char name[IFNAMSIZ]; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); t = ip6gre_tunnel_find(net, parms, ARPHRD_IP6GRE); if (t && create) return NULL; if (t || !create) return t; if (parms->name[0]) { if (!dev_valid_name(parms->name)) return NULL; strscpy(name, parms->name, IFNAMSIZ); } else { strcpy(name, "ip6gre%d"); } dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6gre_tunnel_setup); if (!dev) return NULL; dev_net_set(dev, net); nt = netdev_priv(dev); nt->parms = *parms; dev->rtnl_link_ops = &ip6gre_link_ops; nt->dev = dev; nt->net = dev_net(dev); if (register_netdevice(dev) < 0) goto failed_free; ip6gre_tnl_link_config(nt, 1); ip6gre_tunnel_link(ign, nt); return nt; failed_free: free_netdev(dev); return NULL; } static void ip6erspan_tunnel_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); ip6erspan_tunnel_unlink_md(ign, t); ip6gre_tunnel_unlink(ign, t); dst_cache_reset(&t->dst_cache); netdev_put(dev, &t->dev_tracker); } static void ip6gre_tunnel_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); ip6gre_tunnel_unlink_md(ign, t); ip6gre_tunnel_unlink(ign, t); if (ign->fb_tunnel_dev == dev) WRITE_ONCE(ign->fb_tunnel_dev, NULL); dst_cache_reset(&t->dst_cache); netdev_put(dev, &t->dev_tracker); } static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 
type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); const struct ipv6hdr *ipv6h; struct tnl_ptk_info tpi; struct ip6_tnl *t; if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6), offset) < 0) return -EINVAL; ipv6h = (const struct ipv6hdr *)skb->data; t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr, tpi.key, tpi.proto); if (!t) return -ENOENT; switch (type) { case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); if (code != ICMPV6_PORT_UNREACH) break; return 0; case ICMPV6_TIME_EXCEED: if (code == ICMPV6_EXC_HOPLIMIT) { net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", t->parms.name); break; } return 0; case ICMPV6_PARAMPROB: { struct ipv6_tlv_tnl_enc_lim *tel; __u32 teli; teli = 0; if (code == ICMPV6_HDR_FIELD) teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); if (teli && teli == be32_to_cpu(info) - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", t->parms.name); } } else { net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", t->parms.name); } return 0; } case ICMPV6_PKT_TOOBIG: ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); return 0; case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); return 0; } if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; return 0; } static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { const struct ipv6hdr *ipv6h; struct ip6_tnl *tunnel; ipv6h = ipv6_hdr(skb); tunnel = ip6gre_tunnel_lookup(skb->dev, &ipv6h->saddr, &ipv6h->daddr, tpi->key, tpi->proto); if (tunnel) { if (tunnel->parms.collect_md) { IP_TUNNEL_DECLARE_FLAGS(flags); struct metadata_dst *tun_dst; __be64 tun_id; ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, 0); if (!tun_dst) return PACKET_REJECT; ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); } else { ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); } return PACKET_RCVD; } return PACKET_REJECT; } static int ip6erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int gre_hdr_len) { struct erspan_base_hdr *ershdr; const struct ipv6hdr *ipv6h; struct erspan_md2 *md2; struct ip6_tnl *tunnel; u8 ver; if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr)))) return PACKET_REJECT; ipv6h = ipv6_hdr(skb); ershdr = (struct erspan_base_hdr *)skb->data; ver = ershdr->ver; tunnel = ip6gre_tunnel_lookup(skb->dev, &ipv6h->saddr, &ipv6h->daddr, tpi->key, tpi->proto); if (tunnel) { int len = erspan_hdr_len(ver); if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; if (__iptunnel_pull_header(skb, len, htons(ETH_P_TEB), false, false) < 0) return PACKET_REJECT; if (tunnel->parms.collect_md) { struct erspan_metadata *pkt_md, *md; IP_TUNNEL_DECLARE_FLAGS(flags); struct metadata_dst *tun_dst; struct ip_tunnel_info *info; unsigned char *gh; __be64 tun_id; __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags); ip_tunnel_flags_copy(flags, tpi->flags); tun_id = key32_to_tunnel_id(tpi->key); tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, sizeof(*md)); if (!tun_dst) return PACKET_REJECT; /* skb can be uncloned in __iptunnel_pull_header, so * old pkt_md is no longer valid and we need to reset * it */ gh = skb_network_header(skb) + skb_network_header_len(skb); pkt_md = (struct 
erspan_metadata *)(gh + gre_hdr_len + sizeof(*ershdr)); info = &tun_dst->u.tun_info; md = ip_tunnel_info_opts(info); md->version = ver; md2 = &md->u.md2; memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags); info->options_len = sizeof(*md); ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); } else { ip6_tnl_rcv(tunnel, skb, tpi, NULL, log_ecn_error); } return PACKET_RCVD; } return PACKET_REJECT; } static int gre_rcv(struct sk_buff *skb) { struct tnl_ptk_info tpi; bool csum_err = false; int hdr_len; hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IPV6), 0); if (hdr_len < 0) goto drop; if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false)) goto drop; if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || tpi.proto == htons(ETH_P_ERSPAN2))) { if (ip6erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; goto out; } if (ip6gre_rcv(skb, &tpi) == PACKET_RCVD) return 0; out: icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } static int gre_handle_offloads(struct sk_buff *skb, bool csum) { return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } static void prepare_ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev, struct flowi6 *fl6, __u8 *dsfield, int *encap_limit) { const struct iphdr *iph = ip_hdr(skb); struct ip6_tnl *t = netdev_priv(dev); if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) *encap_limit = t->parms.encap_limit; memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) *dsfield = ipv4_get_dsfield(iph); else *dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6->flowi6_mark = skb->mark; else fl6->flowi6_mark = t->parms.fwmark; fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); } static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev, struct flowi6 *fl6, __u8 *dsfield, int *encap_limit) { struct ipv6hdr *ipv6h; struct ip6_tnl *t = netdev_priv(dev); __u16 offset; offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ ipv6h = ipv6_hdr(skb); if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; if (tel->encap_limit == 0) { icmpv6_ndo_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offset + 2); return -1; } *encap_limit = tel->encap_limit - 1; } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { *encap_limit = t->parms.encap_limit; } memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) *dsfield = ipv6_get_dsfield(ipv6h); else *dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6->flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6->flowi6_mark = skb->mark; else fl6->flowi6_mark = t->parms.fwmark; fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); return 0; } static int prepare_ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev, struct flowi6 *fl6, __u8 *dsfield, int *encap_limit) { struct ip6_tnl *t = netdev_priv(dev); if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) *encap_limit = t->parms.encap_limit; memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) *dsfield = 0; else *dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6->flowi6_mark = skb->mark; else fl6->flowi6_mark = 
t->parms.fwmark; fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); return 0; } static struct ip_tunnel_info *skb_tunnel_info_txcheck(struct sk_buff *skb) { struct ip_tunnel_info *tun_info; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX))) return ERR_PTR(-EINVAL); return tun_info; } static netdev_tx_t __gre6_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, __u32 *pmtu, __be16 proto) { struct ip6_tnl *tunnel = netdev_priv(dev); IP_TUNNEL_DECLARE_FLAGS(flags); __be16 protocol; if (dev->type == ARPHRD_ETHER) IPCB(skb)->flags = 0; if (dev->header_ops && dev->type == ARPHRD_IP6GRE) fl6->daddr = ((struct ipv6hdr *)skb->data)->daddr; else fl6->daddr = tunnel->parms.raddr; /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto; if (tunnel->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; int tun_hlen; tun_info = skb_tunnel_info_txcheck(skb); if (IS_ERR(tun_info) || unlikely(ip_tunnel_info_af(tun_info) != AF_INET6)) return -EINVAL; key = &tun_info->key; memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = IPPROTO_GRE; fl6->daddr = key->u.ipv6.dst; fl6->flowlabel = key->label; fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); fl6->fl6_gre_key = tunnel_id_to_key32(key->tun_id); dsfield = key->tos; ip_tunnel_flags_zero(flags); __set_bit(IP_TUNNEL_CSUM_BIT, flags); __set_bit(IP_TUNNEL_KEY_BIT, flags); __set_bit(IP_TUNNEL_SEQ_BIT, flags); ip_tunnel_flags_and(flags, flags, key->tun_flags); tun_hlen = gre_calc_hlen(flags); if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen)) return -ENOMEM; gre_build_header(skb, tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), test_bit(IP_TUNNEL_SEQ_BIT, flags) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); } else { if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) return -ENOMEM; ip_tunnel_flags_copy(flags, tunnel->parms.o_flags); gre_build_header(skb, tunnel->tun_hlen, flags, protocol, tunnel->parms.o_key, test_bit(IP_TUNNEL_SEQ_BIT, flags) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0); } return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, NEXTHDR_GRE); } static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int encap_limit = -1; struct flowi6 fl6; __u8 dsfield = 0; __u32 mtu; int err; memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); if (!t->parms.collect_md) prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, &dsfield, &encap_limit); err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, t->parms.o_flags)); if (err) return -1; err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, skb->protocol); if (err != 0) { /* XXX: send ICMP error even if DF is not set. 
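 * -EMSGSIZE from __gre6_xmit()/ip6_tnl_xmit() means the encapsulated
 * packet would exceed the tunnel path MTU, so an ICMP "fragmentation
 * needed" carrying the MTU reported in 'mtu' is reflected back to the
 * IPv4 sender.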
*/ if (err == -EMSGSIZE) icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); return -1; } return 0; } static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h = ipv6_hdr(skb); int encap_limit = -1; struct flowi6 fl6; __u8 dsfield = 0; __u32 mtu; int err; if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) return -1; if (!t->parms.collect_md && prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) return -1; if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, t->parms.o_flags))) return -1; err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, skb->protocol); if (err != 0) { if (err == -EMSGSIZE) icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); return -1; } return 0; } static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int encap_limit = -1; struct flowi6 fl6; __u8 dsfield = 0; __u32 mtu; int err; if (!t->parms.collect_md && prepare_ip6gre_xmit_other(skb, dev, &fl6, &dsfield, &encap_limit)) return -1; err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT, t->parms.o_flags)); if (err) return err; err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, skb->protocol); return err; } static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); __be16 payload_protocol; int ret; if (!pskb_inet_may_pull(skb)) goto tx_err; if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) goto tx_err; payload_protocol = skb_protocol(skb, true); switch (payload_protocol) { case htons(ETH_P_IP): ret = ip6gre_xmit_ipv4(skb, dev); break; case htons(ETH_P_IPV6): ret = ip6gre_xmit_ipv6(skb, dev); break; default: ret = ip6gre_xmit_other(skb, dev); break; } if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: if (!t->parms.collect_md || !IS_ERR(skb_tunnel_info_txcheck(skb))) DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel_info *tun_info = NULL; struct ip6_tnl *t = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; bool truncate = false; int encap_limit = -1; __u8 dsfield = false; struct flowi6 fl6; int err = -EINVAL; __be16 proto; __u32 mtu; int nhoff; if (!pskb_inet_may_pull(skb)) goto tx_err; if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) goto tx_err; if (gre_handle_offloads(skb, false)) goto tx_err; if (skb->len > dev->mtu + dev->hard_header_len) { if (pskb_trim(skb, dev->mtu + dev->hard_header_len)) goto tx_err; truncate = true; } nhoff = skb_network_offset(skb); if (skb->protocol == htons(ETH_P_IP) && (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff)) truncate = true; if (skb->protocol == htons(ETH_P_IPV6)) { int thoff; if (skb_transport_header_was_set(skb)) thoff = skb_transport_offset(skb); else thoff = nhoff + sizeof(struct ipv6hdr); if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff) truncate = true; } if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen)) goto tx_err; __clear_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags); IPCB(skb)->flags = 0; /* For collect_md mode, derive fl6 from the tunnel key, * for native mode, call prepare_ip6gre_xmit_{ipv4,ipv6}. 
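 * In collect_md mode the per-packet metadata from skb_tunnel_info()
 * supplies the destination address, flow label, TOS and tunnel key;
 * in native mode they come from the configured tunnel parameters.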
*/ if (t->parms.collect_md) { const struct ip_tunnel_key *key; struct erspan_metadata *md; __be32 tun_id; tun_info = skb_tunnel_info_txcheck(skb); if (IS_ERR(tun_info) || unlikely(ip_tunnel_info_af(tun_info) != AF_INET6)) goto tx_err; key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_GRE; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); fl6.fl6_gre_key = tunnel_id_to_key32(key->tun_id); dsfield = key->tos; if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags)) goto tx_err; if (tun_info->options_len < sizeof(*md)) goto tx_err; md = ip_tunnel_info_opts(tun_info); tun_id = tunnel_id_to_key32(key->tun_id); if (md->version == 1) { erspan_build_header(skb, ntohl(tun_id), ntohl(md->u.index), truncate, false); proto = htons(ETH_P_ERSPAN); } else if (md->version == 2) { erspan_build_header_v2(skb, ntohl(tun_id), md->u.md2.dir, get_hwid(&md->u.md2), truncate, false); proto = htons(ETH_P_ERSPAN2); } else { goto tx_err; } } else { switch (skb->protocol) { case htons(ETH_P_IP): memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, &dsfield, &encap_limit); break; case htons(ETH_P_IPV6): if (ipv6_addr_equal(&t->parms.raddr, &ipv6_hdr(skb)->saddr)) goto tx_err; if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) goto tx_err; break; default: memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); break; } if (t->parms.erspan_ver == 1) { erspan_build_header(skb, ntohl(t->parms.o_key), t->parms.index, truncate, false); proto = htons(ETH_P_ERSPAN); } else if (t->parms.erspan_ver == 2) { erspan_build_header_v2(skb, ntohl(t->parms.o_key), t->parms.dir, t->parms.hwid, truncate, false); proto = htons(ETH_P_ERSPAN2); } else { goto tx_err; } fl6.daddr = t->parms.raddr; } /* Push GRE header. */ __set_bit(IP_TUNNEL_SEQ_BIT, flags); gre_build_header(skb, 8, flags, proto, 0, htonl(atomic_fetch_inc(&t->o_seqno))); /* TooBig packet may have updated dst->dev's mtu */ if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu, false); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, NEXTHDR_GRE); if (err != 0) { /* XXX: send ICMP error even if DF is not set. 
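 * As in ip6gre_xmit_ipv4(), -EMSGSIZE means the encapsulated packet
 * exceeds the path MTU; it is reported with ICMP "fragmentation
 * needed" for IPv4 payloads and ICMPv6 "packet too big" otherwise.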
*/ if (err == -EMSGSIZE) { if (skb->protocol == htons(ETH_P_IP)) icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); else icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); } goto tx_err; } return NETDEV_TX_OK; tx_err: if (!IS_ERR(tun_info)) DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static void ip6gre_tnl_link_config_common(struct ip6_tnl *t) { struct net_device *dev = t->dev; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; if (dev->type != ARPHRD_ETHER) { __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); } /* Set up flowi template */ fl6->saddr = p->laddr; fl6->daddr = p->raddr; fl6->flowi6_oif = p->link; fl6->flowlabel = 0; fl6->flowi6_proto = IPPROTO_GRE; fl6->fl6_gre_key = t->parms.o_key; if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV && dev->type != ARPHRD_ETHER) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; } static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu, int t_hlen) { const struct __ip6_tnl_parm *p = &t->parms; struct net_device *dev = t->dev; if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); if (!rt) return; if (rt->dst.dev) { unsigned short dst_len = rt->dst.dev->hard_header_len + t_hlen; if (t->dev->header_ops) dev->hard_header_len = dst_len; else dev->needed_headroom = dst_len; if (set_mtu) { int mtu = rt->dst.dev->mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) mtu -= 8; if (dev->type == ARPHRD_ETHER) mtu -= ETH_HLEN; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; WRITE_ONCE(dev->mtu, mtu); } } ip6_rt_put(rt); } } static int ip6gre_calc_hlen(struct ip6_tnl *tunnel) { int t_hlen; tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); if (tunnel->dev->header_ops) tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen; else tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen; return t_hlen; } static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) { ip6gre_tnl_link_config_common(t); ip6gre_tnl_link_config_route(t, set_mtu, ip6gre_calc_hlen(t)); } static void ip6gre_tnl_copy_tnl_parm(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) { t->parms.laddr = p->laddr; t->parms.raddr = p->raddr; t->parms.flags = p->flags; t->parms.hop_limit = p->hop_limit; t->parms.encap_limit = p->encap_limit; t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags); ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags); t->parms.fwmark = p->fwmark; t->parms.erspan_ver = p->erspan_ver; t->parms.index = p->index; t->parms.dir = p->dir; t->parms.hwid = p->hwid; dst_cache_reset(&t->dst_cache); } static int ip6gre_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p, int set_mtu) { ip6gre_tnl_copy_tnl_parm(t, p); ip6gre_tnl_link_config(t, 
set_mtu); return 0; } static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm2 *u) { p->laddr = u->laddr; p->raddr = u->raddr; p->flags = u->flags; p->hop_limit = u->hop_limit; p->encap_limit = u->encap_limit; p->flowinfo = u->flowinfo; p->link = u->link; p->i_key = u->i_key; p->o_key = u->o_key; gre_flags_to_tnl_flags(p->i_flags, u->i_flags); gre_flags_to_tnl_flags(p->o_flags, u->o_flags); memcpy(p->name, u->name, sizeof(u->name)); } static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p) { u->proto = IPPROTO_GRE; u->laddr = p->laddr; u->raddr = p->raddr; u->flags = p->flags; u->hop_limit = p->hop_limit; u->encap_limit = p->encap_limit; u->flowinfo = p->flowinfo; u->link = p->link; u->i_key = p->i_key; u->o_key = p->o_key; u->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); u->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); memcpy(u->name, p->name, sizeof(u->name)); } static int ip6gre_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm2 p; struct __ip6_tnl_parm p1; struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); memset(&p1, 0, sizeof(p1)); switch (cmd) { case SIOCGETTUNNEL: if (dev == ign->fb_tunnel_dev) { if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } ip6gre_tnl_parm_from_user(&p1, &p); t = ip6gre_tunnel_locate(net, &p1, 0); if (!t) t = netdev_priv(dev); } memset(&p, 0, sizeof(p)); ip6gre_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) goto done; err = -EINVAL; if ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)) goto done; if (!(p.i_flags&GRE_KEY)) p.i_key = 0; if (!(p.o_flags&GRE_KEY)) p.o_key = 0; ip6gre_tnl_parm_from_user(&p1, &p); t = ip6gre_tunnel_locate(net, &p1, cmd == SIOCADDTUNNEL); if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { if (t) { if (t->dev != dev) { err = -EEXIST; break; } } else { t = netdev_priv(dev); ip6gre_tunnel_unlink(ign, t); synchronize_net(); ip6gre_tnl_change(t, &p1, 1); ip6gre_tunnel_link(ign, t); netdev_state_change(dev); } } if (t) { err = 0; memset(&p, 0, sizeof(p)); ip6gre_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; if (dev == ign->fb_tunnel_dev) { err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) goto done; err = -ENOENT; ip6gre_tnl_parm_from_user(&p1, &p); t = ip6gre_tunnel_locate(net, &p1, 0); if (!t) goto done; err = -EPERM; if (t == netdev_priv(ign->fb_tunnel_dev)) goto done; dev = t->dev; } unregister_netdevice(dev); err = 0; break; default: err = -EINVAL; } done: return err; } static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h; __be16 *p; ipv6h = skb_push(skb, t->hlen + sizeof(*ipv6h)); ip6_flow_hdr(ipv6h, 0, ip6_make_flowlabel(dev_net(dev), skb, t->fl.u.ip6.flowlabel, true, &t->fl.u.ip6)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; ipv6h->daddr = t->parms.raddr; p = (__be16 *)(ipv6h + 1); p[0] = ip_tunnel_flags_to_be16(t->parms.o_flags); p[1] = htons(type); /* * Set the source hardware address. */ if (saddr) memcpy(&ipv6h->saddr, saddr, sizeof(struct in6_addr)); if (daddr) memcpy(&ipv6h->daddr, daddr, sizeof(struct in6_addr)); if (!ipv6_addr_any(&ipv6h->daddr)) return t->hlen; return -t->hlen; } static const struct header_ops ip6gre_header_ops = { .create = ip6gre_header, }; static const struct net_device_ops ip6gre_netdev_ops = { .ndo_init = ip6gre_tunnel_init, .ndo_uninit = ip6gre_tunnel_uninit, .ndo_start_xmit = ip6gre_tunnel_xmit, .ndo_siocdevprivate = ip6gre_tunnel_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_iflink = ip6_tnl_get_iflink, }; static void ip6gre_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); } static void ip6gre_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ip6gre_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->type = ARPHRD_IP6GRE; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); netif_keep_dst(dev); /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); } #define GRE6_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_HW_CSUM) static void ip6gre_tnl_init_features(struct net_device *dev) { struct ip6_tnl *nt = netdev_priv(dev); dev->features |= GRE6_FEATURES; dev->hw_features |= GRE6_FEATURES; /* TCP offload with GRE SEQ is not supported, nor can we support 2 * levels of outer headers requiring an update. 
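 * GSO is therefore only advertised below when the tunnel neither uses
 * GRE sequence numbers nor combines GRE checksumming with an extra
 * encapsulation (nt->encap.type != TUNNEL_ENCAP_NONE).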
*/ if (test_bit(IP_TUNNEL_SEQ_BIT, nt->parms.o_flags)) return; if (test_bit(IP_TUNNEL_CSUM_BIT, nt->parms.o_flags) && nt->encap.type != TUNNEL_ENCAP_NONE) return; dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->lltx = true; } static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; int ret; int t_hlen; tunnel = netdev_priv(dev); tunnel->dev = dev; tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) return ret; ret = gro_cells_init(&tunnel->gro_cells, dev); if (ret) goto cleanup_dst_cache_init; t_hlen = ip6gre_calc_hlen(tunnel); dev->mtu = ETH_DATA_LEN - t_hlen; if (dev->type == ARPHRD_ETHER) dev->mtu -= ETH_HLEN; if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; if (tunnel->parms.collect_md) { netif_keep_dst(dev); } ip6gre_tnl_init_features(dev); netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; cleanup_dst_cache_init: dst_cache_destroy(&tunnel->dst_cache); return ret; } static int ip6gre_tunnel_init(struct net_device *dev) { struct ip6_tnl *tunnel; int ret; ret = ip6gre_tunnel_init_common(dev); if (ret) return ret; tunnel = netdev_priv(dev); if (tunnel->parms.collect_md) return 0; __dev_addr_set(dev, &tunnel->parms.laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr)); if (ipv6_addr_any(&tunnel->parms.raddr)) dev->header_ops = &ip6gre_header_ops; return 0; } static void ip6gre_fb_tunnel_init(struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); tunnel->dev = dev; tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); tunnel->hlen = sizeof(struct ipv6hdr) + 4; } static struct inet6_protocol ip6gre_protocol __read_mostly = { .handler = gre_rcv, .err_handler = ip6gre_err, .flags = INET6_PROTO_FINAL, }; static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) { struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); struct net_device *dev, *aux; int prio; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6gre_link_ops || dev->rtnl_link_ops == &ip6gre_tap_ops || dev->rtnl_link_ops == &ip6erspan_tap_ops) unregister_netdevice_queue(dev, head); for (prio = 0; prio < 4; prio++) { int h; for (h = 0; h < IP6_GRE_HASH_SIZE; h++) { struct ip6_tnl *t; t = rtnl_dereference(ign->tunnels[prio][h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, head); t = rtnl_dereference(t->next); } } } } static int __net_init ip6gre_init_net(struct net *net) { struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); struct net_device *ndev; int err; if (!net_has_fallback_tunnels(net)) return 0; ndev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0", NET_NAME_UNKNOWN, ip6gre_tunnel_setup); if (!ndev) { err = -ENOMEM; goto err_alloc_dev; } ign->fb_tunnel_dev = ndev; dev_net_set(ign->fb_tunnel_dev, net); /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. 
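 * Marking it netns_local below makes the core networking code reject
 * any request to move the fallback device to another namespace.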
*/ ign->fb_tunnel_dev->netns_local = true; ip6gre_fb_tunnel_init(ign->fb_tunnel_dev); ign->fb_tunnel_dev->rtnl_link_ops = &ip6gre_link_ops; err = register_netdev(ign->fb_tunnel_dev); if (err) goto err_reg_dev; rcu_assign_pointer(ign->tunnels_wc[0], netdev_priv(ign->fb_tunnel_dev)); return 0; err_reg_dev: free_netdev(ndev); err_alloc_dev: return err; } static void __net_exit ip6gre_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_to_kill) { struct net *net; ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) ip6gre_destroy_tunnels(net, dev_to_kill); } static struct pernet_operations ip6gre_net_ops = { .init = ip6gre_init_net, .exit_batch_rtnl = ip6gre_exit_batch_rtnl, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), }; static int ip6gre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { __be16 flags; if (!data) return 0; flags = 0; if (data[IFLA_GRE_IFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); if (data[IFLA_GRE_OFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); if (flags & (GRE_VERSION|GRE_ROUTING)) return -EINVAL; return 0; } static int ip6gre_tap_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct in6_addr daddr; if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } if (!data) goto out; if (data[IFLA_GRE_REMOTE]) { daddr = nla_get_in6_addr(data[IFLA_GRE_REMOTE]); if (ipv6_addr_any(&daddr)) return -EINVAL; } out: return ip6gre_tunnel_validate(tb, data, extack); } static int ip6erspan_tap_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { __be16 flags = 0; int ret, ver = 0; if (!data) return 0; ret = ip6gre_tap_validate(tb, data, extack); if (ret) return ret; /* ERSPAN should only have GRE sequence and key flag */ if (data[IFLA_GRE_OFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); if (data[IFLA_GRE_IFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); if (!data[IFLA_GRE_COLLECT_METADATA] && flags != (GRE_SEQ | GRE_KEY)) return -EINVAL; /* ERSPAN Session ID only has 10-bit. Since we reuse * 32-bit key field as ID, check it's range. 
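 * I.e. only the low 10 bits of IFLA_GRE_IKEY/IFLA_GRE_OKEY may be
 * set: 0x3ff is the largest accepted session ID, while e.g. 0x400 is
 * rejected with -EINVAL.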
*/ if (data[IFLA_GRE_IKEY] && (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK)) return -EINVAL; if (data[IFLA_GRE_OKEY] && (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK)) return -EINVAL; if (data[IFLA_GRE_ERSPAN_VER]) { ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); if (ver != 1 && ver != 2) return -EINVAL; } if (ver == 1) { if (data[IFLA_GRE_ERSPAN_INDEX]) { u32 index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); if (index & ~INDEX_MASK) return -EINVAL; } } else if (ver == 2) { if (data[IFLA_GRE_ERSPAN_DIR]) { u16 dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); if (dir & ~(DIR_MASK >> DIR_OFFSET)) return -EINVAL; } if (data[IFLA_GRE_ERSPAN_HWID]) { u16 hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); if (hwid & ~(HWID_MASK >> HWID_OFFSET)) return -EINVAL; } } return 0; } static void ip6erspan_set_version(struct nlattr *data[], struct __ip6_tnl_parm *parms) { if (!data) return; parms->erspan_ver = 1; if (data[IFLA_GRE_ERSPAN_VER]) parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); if (parms->erspan_ver == 1) { if (data[IFLA_GRE_ERSPAN_INDEX]) parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); } else if (parms->erspan_ver == 2) { if (data[IFLA_GRE_ERSPAN_DIR]) parms->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); if (data[IFLA_GRE_ERSPAN_HWID]) parms->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); } } static void ip6gre_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_GRE_LINK]) parms->link = nla_get_u32(data[IFLA_GRE_LINK]); if (data[IFLA_GRE_IFLAGS]) gre_flags_to_tnl_flags(parms->i_flags, nla_get_be16(data[IFLA_GRE_IFLAGS])); if (data[IFLA_GRE_OFLAGS]) gre_flags_to_tnl_flags(parms->o_flags, nla_get_be16(data[IFLA_GRE_OFLAGS])); if (data[IFLA_GRE_IKEY]) parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); if (data[IFLA_GRE_OKEY]) parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); if (data[IFLA_GRE_LOCAL]) parms->laddr = nla_get_in6_addr(data[IFLA_GRE_LOCAL]); if (data[IFLA_GRE_REMOTE]) parms->raddr = nla_get_in6_addr(data[IFLA_GRE_REMOTE]); if (data[IFLA_GRE_TTL]) parms->hop_limit = nla_get_u8(data[IFLA_GRE_TTL]); if (data[IFLA_GRE_ENCAP_LIMIT]) parms->encap_limit = nla_get_u8(data[IFLA_GRE_ENCAP_LIMIT]); if (data[IFLA_GRE_FLOWINFO]) parms->flowinfo = nla_get_be32(data[IFLA_GRE_FLOWINFO]); if (data[IFLA_GRE_FLAGS]) parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]); if (data[IFLA_GRE_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); if (data[IFLA_GRE_COLLECT_METADATA]) parms->collect_md = true; } static int ip6gre_tap_init(struct net_device *dev) { int ret; ret = ip6gre_tunnel_init_common(dev); if (ret) return ret; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; return 0; } static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_init = ip6gre_tap_init, .ndo_uninit = ip6gre_tunnel_uninit, .ndo_start_xmit = ip6gre_tunnel_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_iflink = ip6_tnl_get_iflink, }; static int ip6erspan_calc_hlen(struct ip6_tnl *tunnel) { int t_hlen; tunnel->tun_hlen = 8; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + erspan_hdr_len(tunnel->parms.erspan_ver); t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen; return t_hlen; } static int ip6erspan_tap_init(struct net_device *dev) { struct ip6_tnl *tunnel; int t_hlen; int ret; tunnel = netdev_priv(dev); tunnel->dev = dev; tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, 
dev->name); ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) return ret; ret = gro_cells_init(&tunnel->gro_cells, dev); if (ret) goto cleanup_dst_cache_init; t_hlen = ip6erspan_calc_hlen(tunnel); dev->mtu = ETH_DATA_LEN - t_hlen; if (dev->type == ARPHRD_ETHER) dev->mtu -= ETH_HLEN; if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ip6erspan_tnl_link_config(tunnel, 1); netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; cleanup_dst_cache_init: dst_cache_destroy(&tunnel->dst_cache); return ret; } static const struct net_device_ops ip6erspan_netdev_ops = { .ndo_init = ip6erspan_tap_init, .ndo_uninit = ip6erspan_tunnel_uninit, .ndo_start_xmit = ip6erspan_tunnel_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_iflink = ip6_tnl_get_iflink, }; static void ip6gre_tap_setup(struct net_device *dev) { ether_setup(dev); dev->max_mtu = 0; dev->netdev_ops = &ip6gre_tap_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); } static bool ip6gre_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *ipencap) { bool ret = false; memset(ipencap, 0, sizeof(*ipencap)); if (!data) return ret; if (data[IFLA_GRE_ENCAP_TYPE]) { ret = true; ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]); } if (data[IFLA_GRE_ENCAP_FLAGS]) { ret = true; ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]); } if (data[IFLA_GRE_ENCAP_SPORT]) { ret = true; ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]); } if (data[IFLA_GRE_ENCAP_DPORT]) { ret = true; ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]); } return ret; } static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *nt; struct ip_tunnel_encap ipencap; int err; nt = netdev_priv(dev); if (ip6gre_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(nt, &ipencap); if (err < 0) return err; } if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); nt->dev = dev; nt->net = dev_net(dev); err = register_netdevice(dev); if (err) goto out; if (tb[IFLA_MTU]) ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); out: return err; } static int ip6gre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *nt = netdev_priv(dev); struct net *net = dev_net(dev); struct ip6gre_net *ign; int err; ip6gre_netlink_parms(data, &nt->parms); ign = net_generic(net, ip6gre_net_id); if (nt->parms.collect_md) { if (rtnl_dereference(ign->collect_md_tun)) return -EEXIST; } else { if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) return -EEXIST; } err = ip6gre_newlink_common(src_net, dev, tb, data, extack); if (!err) { ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); ip6gre_tunnel_link_md(ign, nt); ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt); } return err; } static struct ip6_tnl * ip6gre_changelink_common(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct __ip6_tnl_parm *p_p, struct netlink_ext_ack *extack) { struct ip6_tnl *t, *nt = netdev_priv(dev); struct net *net = nt->net; struct ip6gre_net *ign = net_generic(net, 
ip6gre_net_id); struct ip_tunnel_encap ipencap; if (dev == ign->fb_tunnel_dev) return ERR_PTR(-EINVAL); if (ip6gre_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(nt, &ipencap); if (err < 0) return ERR_PTR(err); } ip6gre_netlink_parms(data, p_p); t = ip6gre_tunnel_locate(net, p_p, 0); if (t) { if (t->dev != dev) return ERR_PTR(-EEXIST); } else { t = nt; } return t; } static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *t = netdev_priv(dev); struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); struct __ip6_tnl_parm p; t = ip6gre_changelink_common(dev, tb, data, &p, extack); if (IS_ERR(t)) return PTR_ERR(t); ip6gre_tunnel_unlink_md(ign, t); ip6gre_tunnel_unlink(ign, t); ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]); ip6gre_tunnel_link_md(ign, t); ip6gre_tunnel_link(ign, t); return 0; } static void ip6gre_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); if (dev != ign->fb_tunnel_dev) unregister_netdevice_queue(dev, head); } static size_t ip6gre_get_size(const struct net_device *dev) { return /* IFLA_GRE_LINK */ nla_total_size(4) + /* IFLA_GRE_IFLAGS */ nla_total_size(2) + /* IFLA_GRE_OFLAGS */ nla_total_size(2) + /* IFLA_GRE_IKEY */ nla_total_size(4) + /* IFLA_GRE_OKEY */ nla_total_size(4) + /* IFLA_GRE_LOCAL */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GRE_REMOTE */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GRE_TTL */ nla_total_size(1) + /* IFLA_GRE_ENCAP_LIMIT */ nla_total_size(1) + /* IFLA_GRE_FLOWINFO */ nla_total_size(4) + /* IFLA_GRE_FLAGS */ nla_total_size(4) + /* IFLA_GRE_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_GRE_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_GRE_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_GRE_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_GRE_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_GRE_FWMARK */ nla_total_size(4) + /* IFLA_GRE_ERSPAN_INDEX */ nla_total_size(4) + 0; } static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm *p = &t->parms; IP_TUNNEL_DECLARE_FLAGS(o_flags); ip_tunnel_flags_copy(o_flags, p->o_flags); if (p->erspan_ver == 1 || p->erspan_ver == 2) { if (!p->collect_md) __set_bit(IP_TUNNEL_KEY_BIT, o_flags); if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver)) goto nla_put_failure; if (p->erspan_ver == 1) { if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) goto nla_put_failure; } else { if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, p->dir)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, p->hwid)) goto nla_put_failure; } } if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || nla_put_be16(skb, IFLA_GRE_IFLAGS, gre_tnl_flags_to_gre_flags(p->i_flags)) || nla_put_be16(skb, IFLA_GRE_OFLAGS, gre_tnl_flags_to_gre_flags(o_flags)) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || nla_put_in6_addr(skb, IFLA_GRE_LOCAL, &p->laddr) || nla_put_in6_addr(skb, IFLA_GRE_REMOTE, &p->raddr) || nla_put_u8(skb, IFLA_GRE_TTL, p->hop_limit) || nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) || nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) || nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags) || nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, t->encap.type) || nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT, t->encap.sport) || nla_put_be16(skb, 
IFLA_GRE_ENCAP_DPORT, t->encap.dport) || nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, t->encap.flags)) goto nla_put_failure; if (p->collect_md) { if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_LINK] = { .type = NLA_U32 }, [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, [IFLA_GRE_IKEY] = { .type = NLA_U32 }, [IFLA_GRE_OKEY] = { .type = NLA_U32 }, [IFLA_GRE_LOCAL] = { .len = sizeof_field(struct ipv6hdr, saddr) }, [IFLA_GRE_REMOTE] = { .len = sizeof_field(struct ipv6hdr, daddr) }, [IFLA_GRE_TTL] = { .type = NLA_U8 }, [IFLA_GRE_ENCAP_LIMIT] = { .type = NLA_U8 }, [IFLA_GRE_FLOWINFO] = { .type = NLA_U32 }, [IFLA_GRE_FLAGS] = { .type = NLA_U32 }, [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 }, [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 }, [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 }, }; static void ip6erspan_tap_setup(struct net_device *dev) { ether_setup(dev); dev->max_mtu = 0; dev->netdev_ops = &ip6erspan_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6gre_dev_free; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netif_keep_dst(dev); } static int ip6erspan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *nt = netdev_priv(dev); struct net *net = dev_net(dev); struct ip6gre_net *ign; int err; ip6gre_netlink_parms(data, &nt->parms); ip6erspan_set_version(data, &nt->parms); ign = net_generic(net, ip6gre_net_id); if (nt->parms.collect_md) { if (rtnl_dereference(ign->collect_md_tun_erspan)) return -EEXIST; } else { if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) return -EEXIST; } err = ip6gre_newlink_common(src_net, dev, tb, data, extack); if (!err) { ip6erspan_tnl_link_config(nt, !tb[IFLA_MTU]); ip6erspan_tunnel_link_md(ign, nt); ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt); } return err; } static void ip6erspan_tnl_link_config(struct ip6_tnl *t, int set_mtu) { ip6gre_tnl_link_config_common(t); ip6gre_tnl_link_config_route(t, set_mtu, ip6erspan_calc_hlen(t)); } static int ip6erspan_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p, int set_mtu) { ip6gre_tnl_copy_tnl_parm(t, p); ip6erspan_tnl_link_config(t, set_mtu); return 0; } static int ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id); struct __ip6_tnl_parm p; struct ip6_tnl *t; t = ip6gre_changelink_common(dev, tb, data, &p, extack); if (IS_ERR(t)) return PTR_ERR(t); ip6erspan_set_version(data, &p); ip6gre_tunnel_unlink_md(ign, t); ip6gre_tunnel_unlink(ign, t); ip6erspan_tnl_change(t, &p, !tb[IFLA_MTU]); ip6erspan_tunnel_link_md(ign, t); ip6gre_tunnel_link(ign, t); return 0; } static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { .kind = "ip6gre", .maxtype = IFLA_GRE_MAX, .policy = ip6gre_policy, .priv_size = sizeof(struct ip6_tnl), .setup = ip6gre_tunnel_setup, .validate = 
ip6gre_tunnel_validate,
	.newlink	= ip6gre_newlink,
	.changelink	= ip6gre_changelink,
	.dellink	= ip6gre_dellink,
	.get_size	= ip6gre_get_size,
	.fill_info	= ip6gre_fill_info,
	.get_link_net	= ip6_tnl_get_link_net,
};

static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = {
	.kind		= "ip6gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ip6gre_policy,
	.priv_size	= sizeof(struct ip6_tnl),
	.setup		= ip6gre_tap_setup,
	.validate	= ip6gre_tap_validate,
	.newlink	= ip6gre_newlink,
	.changelink	= ip6gre_changelink,
	.get_size	= ip6gre_get_size,
	.fill_info	= ip6gre_fill_info,
	.get_link_net	= ip6_tnl_get_link_net,
};

static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly = {
	.kind		= "ip6erspan",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ip6gre_policy,
	.priv_size	= sizeof(struct ip6_tnl),
	.setup		= ip6erspan_tap_setup,
	.validate	= ip6erspan_tap_validate,
	.newlink	= ip6erspan_newlink,
	.changelink	= ip6erspan_changelink,
	.get_size	= ip6gre_get_size,
	.fill_info	= ip6gre_fill_info,
	.get_link_net	= ip6_tnl_get_link_net,
};

/*
 * And now the modules code and kernel interface.
 */

static int __init ip6gre_init(void)
{
	int err;

	pr_info("GRE over IPv6 tunneling driver\n");

	err = register_pernet_device(&ip6gre_net_ops);
	if (err < 0)
		return err;

	err = inet6_add_protocol(&ip6gre_protocol, IPPROTO_GRE);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ip6gre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ip6gre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

	err = rtnl_link_register(&ip6erspan_tap_ops);
	if (err < 0)
		goto erspan_link_failed;

out:
	return err;

erspan_link_failed:
	rtnl_link_unregister(&ip6gre_tap_ops);
tap_ops_failed:
	rtnl_link_unregister(&ip6gre_link_ops);
rtnl_link_failed:
	inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE);
add_proto_failed:
	unregister_pernet_device(&ip6gre_net_ops);
	goto out;
}

static void __exit ip6gre_fini(void)
{
	rtnl_link_unregister(&ip6gre_tap_ops);
	rtnl_link_unregister(&ip6gre_link_ops);
	rtnl_link_unregister(&ip6erspan_tap_ops);
	inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE);
	unregister_pernet_device(&ip6gre_net_ops);
}

module_init(ip6gre_init);
module_exit(ip6gre_fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("D. Kozlov <xeb@mail.ru>");
MODULE_DESCRIPTION("GRE over IPv6 tunneling device");
MODULE_ALIAS_RTNL_LINK("ip6gre");
MODULE_ALIAS_RTNL_LINK("ip6gretap");
MODULE_ALIAS_RTNL_LINK("ip6erspan");
MODULE_ALIAS_NETDEV("ip6gre0");
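/*
 * Illustrative usage (editor's sketch, not part of the original source):
 * the rtnl_link_ops registered above expose the "ip6gre", "ip6gretap"
 * and "ip6erspan" link kinds, so tunnels of each type can be created
 * from userspace roughly as follows (addresses are from the
 * 2001:db8::/32 documentation prefix; exact iproute2 option names may
 * vary between versions):
 *
 *   ip link add g1  type ip6gre    local 2001:db8::1 remote 2001:db8::2
 *   ip link add gt1 type ip6gretap local 2001:db8::1 remote 2001:db8::2
 *   ip link add es1 type ip6erspan local 2001:db8::1 remote 2001:db8::2 \
 *                   seq key 10 erspan_ver 1 erspan 123
 */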
/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

/*
 * Handle hardware traps and faults.
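 * The exception entry points in this file are defined with the
 * DEFINE_IDTENTRY*() family of macros.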
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/context_tracking.h> #include <linux/interrupt.h> #include <linux/kallsyms.h> #include <linux/kmsan.h> #include <linux/spinlock.h> #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/kdebug.h> #include <linux/kgdb.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/ptrace.h> #include <linux/uprobes.h> #include <linux/string.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/kexec.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/timer.h> #include <linux/init.h> #include <linux/bug.h> #include <linux/nmi.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/io.h> #include <linux/hardirq.h> #include <linux/atomic.h> #include <linux/iommu.h> #include <linux/ubsan.h> #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> #include <asm/realmode.h> #include <asm/text-patching.h> #include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> #include <asm/fred.h> #include <asm/fpu/api.h> #include <asm/cpu.h> #include <asm/cpu_entry_area.h> #include <asm/mce.h> #include <asm/fixmap.h> #include <asm/mach_traps.h> #include <asm/alternative.h> #include <asm/fpu/xstate.h> #include <asm/vm86.h> #include <asm/umip.h> #include <asm/insn.h> #include <asm/insn-eval.h> #include <asm/vdso.h> #include <asm/tdx.h> #include <asm/cfi.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> #else #include <asm/processor-flags.h> #include <asm/setup.h> #endif #include <asm/proto.h> DECLARE_BITMAP(system_vectors, NR_VECTORS); __always_inline int is_valid_bugaddr(unsigned long addr) { if (addr < TASK_SIZE_MAX) return 0; /* * We got #UD, if the text isn't readable we'd have gotten * a different exception. */ return *(unsigned short *)addr == INSN_UD2; } /* * Check for UD1 or UD2, accounting for Address Size Override Prefixes. * If it's a UD1, get the ModRM byte to pass along to UBSan. */ __always_inline int decode_bug(unsigned long addr, u32 *imm) { u8 v; if (addr < TASK_SIZE_MAX) return BUG_NONE; v = *(u8 *)(addr++); if (v == INSN_ASOP) v = *(u8 *)(addr++); if (v != OPCODE_ESCAPE) return BUG_NONE; v = *(u8 *)(addr++); if (v == SECOND_BYTE_OPCODE_UD2) return BUG_UD2; if (!IS_ENABLED(CONFIG_UBSAN_TRAP) || v != SECOND_BYTE_OPCODE_UD1) return BUG_NONE; /* Retrieve the immediate (type value) for the UBSAN UD1 */ v = *(u8 *)(addr++); if (X86_MODRM_RM(v) == 4) addr++; *imm = 0; if (X86_MODRM_MOD(v) == 1) *imm = *(u8 *)addr; else if (X86_MODRM_MOD(v) == 2) *imm = *(u32 *)addr; else WARN_ONCE(1, "Unexpected MODRM_MOD: %u\n", X86_MODRM_MOD(v)); return BUG_UD1; } static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, struct pt_regs *regs, long error_code) { if (v8086_mode(regs)) { /* * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. * On nmi (interrupt 2), do_trap should not be called. */ if (trapnr < X86_TRAP_UD) { if (!handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr)) return 0; } } else if (!user_mode(regs)) { if (fixup_exception(regs, trapnr, error_code, 0)) return 0; tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; die(str, regs, error_code); } else { if (fixup_vdso_exception(regs, trapnr, error_code, 0)) return 0; } /* * We want error_code and trap_nr set for userspace faults and * kernelspace faults which result in die(), but not * kernelspace faults which are fixed up. 
die() gives the * process no chance to handle the signal and notice the * kernel fault information, so that won't result in polluting * the information about previously queued, but not yet * delivered, faults. See also exc_general_protection below. */ tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; return -1; } static void show_signal(struct task_struct *tsk, int signr, const char *type, const char *desc, struct pt_regs *regs, long error_code) { if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { pr_info("%s[%d] %s%s ip:%lx sp:%lx error:%lx", tsk->comm, task_pid_nr(tsk), type, desc, regs->ip, regs->sp, error_code); print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } } static void do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, int sicode, void __user *addr) { struct task_struct *tsk = current; if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) return; show_signal(tsk, signr, "trap ", str, regs, error_code); if (!sicode) force_sig(signr); else force_sig_fault(signr, sicode, addr); } NOKPROBE_SYMBOL(do_trap); static void do_error_trap(struct pt_regs *regs, long error_code, char *str, unsigned long trapnr, int signr, int sicode, void __user *addr) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { cond_local_irq_enable(regs); do_trap(trapnr, signr, str, regs, error_code, sicode, addr); cond_local_irq_disable(regs); } } /* * Posix requires to provide the address of the faulting instruction for * SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t. * * This address is usually regs->ip, but when an uprobe moved the code out * of line then regs->ip points to the XOL code which would confuse * anything which analyzes the fault address vs. the unmodified binary. If * a trap happened in XOL code then uprobe maps regs->ip back to the * original instruction address. */ static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs) { return (void __user *)uprobe_get_trap_addr(regs); } DEFINE_IDTENTRY(exc_divide_error) { do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE, FPE_INTDIV, error_get_trap_addr(regs)); } DEFINE_IDTENTRY(exc_overflow) { do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); } #ifdef CONFIG_X86_F00F_BUG void handle_invalid_op(struct pt_regs *regs) #else static inline void handle_invalid_op(struct pt_regs *regs) #endif { do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL, ILL_ILLOPN, error_get_trap_addr(regs)); } static noinstr bool handle_bug(struct pt_regs *regs) { bool handled = false; int ud_type; u32 imm; ud_type = decode_bug(regs->ip, &imm); if (ud_type == BUG_NONE) return handled; /* * All lies, just get the WARN/BUG out. */ instrumentation_begin(); /* * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() * is a rare case that uses @regs without passing them to * irqentry_enter(). */ kmsan_unpoison_entry_regs(regs); /* * Since we're emulating a CALL with exceptions, restore the interrupt * state to what it was at the exception site. 
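 * I.e. if the saved flags have IF set, interrupts are re-enabled
 * around the WARN/BUG report and disabled again before returning.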
*/ if (regs->flags & X86_EFLAGS_IF) raw_local_irq_enable(); if (ud_type == BUG_UD2) { if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN || handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { regs->ip += LEN_UD2; handled = true; } } else if (IS_ENABLED(CONFIG_UBSAN_TRAP)) { pr_crit("%s at %pS\n", report_ubsan_failure(regs, imm), (void *)regs->ip); } if (regs->flags & X86_EFLAGS_IF) raw_local_irq_disable(); instrumentation_end(); return handled; } DEFINE_IDTENTRY_RAW(exc_invalid_op) { irqentry_state_t state; /* * We use UD2 as a short encoding for 'CALL __WARN', as such * handle it before exception entry to avoid recursive WARN * in case exception entry is the one triggering WARNs. */ if (!user_mode(regs) && handle_bug(regs)) return; state = irqentry_enter(regs); instrumentation_begin(); handle_invalid_op(regs); instrumentation_end(); irqentry_exit(regs, state); } DEFINE_IDTENTRY(exc_coproc_segment_overrun) { do_error_trap(regs, 0, "coprocessor segment overrun", X86_TRAP_OLD_MF, SIGFPE, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss) { do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present) { do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP, SIGBUS, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment) { do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check) { char *str = "alignment check"; if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP) return; if (!user_mode(regs)) die("Split lock detected\n", regs, error_code); local_irq_enable(); if (handle_user_split_lock(regs, error_code)) goto out; do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs, error_code, BUS_ADRALN, NULL); out: local_irq_disable(); } #ifdef CONFIG_VMAP_STACK __visible void __noreturn handle_stack_overflow(struct pt_regs *regs, unsigned long fault_address, struct stack_info *info) { const char *name = stack_type_name(info->type); printk(KERN_EMERG "BUG: %s stack guard page was hit at %p (stack is %p..%p)\n", name, (void *)fault_address, info->begin, info->end); die("stack guard page", regs, 0); /* Be absolutely certain we don't return. */ panic("%s stack guard hit", name); } #endif /* * Runs on an IST stack for x86_64 and on a special task stack for x86_32. * * On x86_64, this is more or less a normal kernel entry. Notwithstanding the * SDM's warnings about double faults being unrecoverable, returning works as * expected. Presumably what the SDM actually means is that the CPU may get * the register state wrong on entry, so returning could be a bad idea. * * Various CPU engineers have promised that double faults due to an IRET fault * while the stack is read-only are, in fact, recoverable. * * On x86_32, this is entered through a task gate, and regs are synthesized * from the TSS. Returning is, in principle, okay, but changes to regs will * be lost. If, for some reason, we need to return to a context with modified * regs, the shim code could be adjusted to synchronize the registers. * * The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs * to be read before doing anything else. 
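 * That is why, under CONFIG_VMAP_STACK, read_cr2() is the first thing
 * the handler below does: a later fault would overwrite CR2 and lose
 * the address needed for the stack-overflow check.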
*/ DEFINE_IDTENTRY_DF(exc_double_fault) { static const char str[] = "double fault"; struct task_struct *tsk = current; #ifdef CONFIG_VMAP_STACK unsigned long address = read_cr2(); struct stack_info info; #endif #ifdef CONFIG_X86_ESPFIX64 extern unsigned char native_irq_return_iret[]; /* * If IRET takes a non-IST fault on the espfix64 stack, then we * end up promoting it to a doublefault. In that case, take * advantage of the fact that we're not using the normal (TSS.sp0) * stack right now. We can write a fake #GP(0) frame at TSS.sp0 * and then modify our own IRET frame so that, when we return, * we land directly at the #GP(0) vector with the stack already * set up according to its expectations. * * The net result is that our #GP handler will think that we * entered from usermode with the bad user context. * * No need for nmi_enter() here because we don't use RCU. */ if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; unsigned long *p = (unsigned long *)regs->sp; /* * regs->sp points to the failing IRET frame on the * ESPFIX64 stack. Copy it to the entry stack. This fills * in gpregs->ss through gpregs->ip. * */ gpregs->ip = p[0]; gpregs->cs = p[1]; gpregs->flags = p[2]; gpregs->sp = p[3]; gpregs->ss = p[4]; gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ /* * Adjust our frame so that we return straight to the #GP * vector with the expected RSP value. This is safe because * we won't enable interrupts or schedule before we invoke * general_protection, so nothing will clobber the stack * frame we just set up. * * We will enter general_protection with kernel GSBASE, * which is what the stub expects, given that the faulting * RIP will be the IRET instruction. */ regs->ip = (unsigned long)asm_exc_general_protection; regs->sp = (unsigned long)&gpregs->orig_ax; return; } #endif irqentry_nmi_enter(regs); instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_DF; #ifdef CONFIG_VMAP_STACK /* * If we overflow the stack into a guard page, the CPU will fail * to deliver #PF and will send #DF instead. Similarly, if we * take any non-IST exception while too close to the bottom of * the stack, the processor will get a page fault while * delivering the exception and will generate a double fault. * * According to the SDM (footnote in 6.15 under "Interrupt 14 - * Page-Fault Exception (#PF): * * Processors update CR2 whenever a page fault is detected. If a * second page fault occurs while an earlier page fault is being * delivered, the faulting linear address of the second fault will * overwrite the contents of CR2 (replacing the previous * address). These updates to CR2 occur even if the page fault * results in a double fault or occurs during the delivery of a * double fault. * * The logic below has a small possibility of incorrectly diagnosing * some errors as stack overflows. For example, if the IDT or GDT * gets corrupted such that #GP delivery fails due to a bad descriptor * causing #GP and we hit this condition while CR2 coincidentally * points to the stack guard page, we'll think we overflowed the * stack. Given that we're going to panic one way or another * if this happens, this isn't necessarily worth fixing. 
* * If necessary, we could improve the test by only diagnosing * a stack overflow if the saved RSP points within 47 bytes of * the bottom of the stack: if RSP == tsk_stack + 48 and we * take an exception, the stack is already aligned and there * will be enough room SS, RSP, RFLAGS, CS, RIP, and a * possible error code, so a stack overflow would *not* double * fault. With any less space left, exception delivery could * fail, and, as a practical matter, we've overflowed the * stack even if the actual trigger for the double fault was * something else. */ if (get_stack_guard_info((void *)address, &info)) handle_stack_overflow(regs, address, &info); #endif pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); die("double fault", regs, error_code); panic("Machine halted."); instrumentation_end(); } DEFINE_IDTENTRY(exc_bounds) { if (notify_die(DIE_TRAP, "bounds", regs, 0, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; cond_local_irq_enable(regs); if (!user_mode(regs)) die("bounds", regs, 0); do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL); cond_local_irq_disable(regs); } enum kernel_gp_hint { GP_NO_HINT, GP_NON_CANONICAL, GP_CANONICAL }; /* * When an uncaught #GP occurs, try to determine the memory address accessed by * the instruction and return that address to the caller. Also, try to figure * out whether any part of the access to that address was non-canonical. */ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, unsigned long *addr) { u8 insn_buf[MAX_INSN_SIZE]; struct insn insn; int ret; if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) return GP_NO_HINT; ret = insn_decode_kernel(&insn, insn_buf); if (ret < 0) return GP_NO_HINT; *addr = (unsigned long)insn_get_addr_ref(&insn, regs); if (*addr == -1UL) return GP_NO_HINT; #ifdef CONFIG_X86_64 /* * Check that: * - the operand is not in the kernel half * - the last byte of the operand is not in the user canonical half */ if (*addr < ~__VIRTUAL_MASK && *addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK) return GP_NON_CANONICAL; #endif return GP_CANONICAL; } #define GPFSTR "general protection fault" static bool fixup_iopl_exception(struct pt_regs *regs) { struct thread_struct *t = ¤t->thread; unsigned char byte; unsigned long ip; if (!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3) return false; if (insn_get_effective_ip(regs, &ip)) return false; if (get_user(byte, (const char __user *)ip)) return false; if (byte != 0xfa && byte != 0xfb) return false; if (!t->iopl_warn && printk_ratelimit()) { pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx", current->comm, task_pid_nr(current), ip); print_vma_addr(KERN_CONT " in ", ip); pr_cont("\n"); t->iopl_warn = 1; } regs->ip += 1; return true; } /* * The unprivileged ENQCMD instruction generates #GPs if the * IA32_PASID MSR has not been populated. If possible, populate * the MSR from a PASID previously allocated to the mm. */ static bool try_fixup_enqcmd_gp(void) { #ifdef CONFIG_ARCH_HAS_CPU_PASID u32 pasid; /* * MSR_IA32_PASID is managed using XSAVE. Directly * writing to the MSR is only possible when fpregs * are valid and the fpstate is not. This is * guaranteed when handling a userspace exception * in *before* interrupts are re-enabled. */ lockdep_assert_irqs_disabled(); /* * Hardware without ENQCMD will not generate * #GPs that can be fixed up here. */ if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) return false; /* * If the mm has not been allocated a * PASID, the #GP can not be fixed up. 
*/ if (!mm_valid_pasid(current->mm)) return false; pasid = mm_get_enqcmd_pasid(current->mm); /* * Did this thread already have its PASID activated? * If so, the #GP must be from something else. */ if (current->pasid_activated) return false; wrmsrl(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); current->pasid_activated = 1; return true; #else return false; #endif } static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, unsigned long error_code, const char *str, unsigned long address) { if (fixup_exception(regs, trapnr, error_code, address)) return true; current->thread.error_code = error_code; current->thread.trap_nr = trapnr; /* * To be potentially processing a kprobe fault and to trust the result * from kprobe_running(), we have to be non-preemptible. */ if (!preemptible() && kprobe_running() && kprobe_fault_handler(regs, trapnr)) return true; return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP; } static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr, unsigned long error_code, const char *str) { current->thread.error_code = error_code; current->thread.trap_nr = trapnr; show_signal(current, SIGSEGV, "", str, regs, error_code); force_sig(SIGSEGV); } DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) { char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; enum kernel_gp_hint hint = GP_NO_HINT; unsigned long gp_addr; if (user_mode(regs) && try_fixup_enqcmd_gp()) return; cond_local_irq_enable(regs); if (static_cpu_has(X86_FEATURE_UMIP)) { if (user_mode(regs) && fixup_umip_exception(regs)) goto exit; } if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); local_irq_disable(); return; } if (user_mode(regs)) { if (fixup_iopl_exception(regs)) goto exit; if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) goto exit; gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc); goto exit; } if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc, 0)) goto exit; if (error_code) snprintf(desc, sizeof(desc), "segment-related " GPFSTR); else hint = get_kernel_gp_address(regs, &gp_addr); if (hint != GP_NO_HINT) snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx", (hint == GP_NON_CANONICAL) ? "probably for non-canonical address" : "maybe for address", gp_addr); /* * KASAN is interested only in the non-canonical case, clear it * otherwise. */ if (hint != GP_NON_CANONICAL) gp_addr = 0; die_addr(desc, regs, error_code, gp_addr); exit: cond_local_irq_disable(regs); } static bool do_int3(struct pt_regs *regs) { int res; #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) return true; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ #ifdef CONFIG_KPROBES if (kprobe_int3_handler(regs)) return true; #endif res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP); return res == NOTIFY_STOP; } NOKPROBE_SYMBOL(do_int3); static void do_int3_user(struct pt_regs *regs) { if (do_int3(regs)) return; cond_local_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL); cond_local_irq_disable(regs); } DEFINE_IDTENTRY_RAW(exc_int3) { /* * poke_int3_handler() is completely self contained code; it does (and * must) *NOT* call out to anything, lest it hits upon yet another * INT3. */ if (poke_int3_handler(regs)) return; /* * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() * and therefore can trigger INT3, hence poke_int3_handler() must * be done before. 
If the entry came from kernel mode, then use * nmi_enter() because the INT3 could have been hit in any context * including NMI. */ if (user_mode(regs)) { irqentry_enter_from_user_mode(regs); instrumentation_begin(); do_int3_user(regs); instrumentation_end(); irqentry_exit_to_user_mode(regs); } else { irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); if (!do_int3(regs)) die("int3", regs, 0); instrumentation_end(); irqentry_nmi_exit(regs, irq_state); } } #ifdef CONFIG_X86_64 /* * Help handler running on a per-cpu (IST or entry trampoline) stack * to switch to the normal thread stack if the interrupted code was in * user mode. The actual stack switch is done in entry_64.S */ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = (struct pt_regs *)current_top_of_stack() - 1; if (regs != eregs) *regs = *eregs; return regs; } #ifdef CONFIG_AMD_MEM_ENCRYPT asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs) { unsigned long sp, *stack; struct stack_info info; struct pt_regs *regs_ret; /* * In the SYSCALL entry path the RSP value comes from user-space - don't * trust it and switch to the current kernel stack */ if (ip_within_syscall_gap(regs)) { sp = current_top_of_stack(); goto sync; } /* * From here on the RSP value is trusted. Now check whether entry * happened from a safe stack. Not safe are the entry or unknown stacks, * use the fall-back stack instead in this case. */ sp = regs->sp; stack = (unsigned long *)sp; if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY || info.type > STACK_TYPE_EXCEPTION_LAST) sp = __this_cpu_ist_top_va(VC2); sync: /* * Found a safe stack - switch to it as if the entry didn't happen via * IST stack. The code below only copies pt_regs, the real switch happens * in assembly code. */ sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret); regs_ret = (struct pt_regs *)sp; *regs_ret = *regs; return regs_ret; } #endif asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs) { struct pt_regs tmp, *new_stack; /* * This is called from entry_64.S early in handling a fault * caused by a bad iret to user mode. To handle the fault * correctly, we want to move our stack frame to where it would * be had we entered directly on the entry stack (rather than * just below the IRET frame) and we want to pretend that the * exception came from the IRET target. */ new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the temporary storage. */ __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8); /* Copy the remainder of the stack from the current stack. */ __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip)); /* Update the entry stack */ __memcpy(new_stack, &tmp, sizeof(tmp)); BUG_ON(!user_mode(new_stack)); return new_stack; } #endif static bool is_sysenter_singlestep(struct pt_regs *regs) { /* * We don't try for precision here. If we're anywhere in the region of * code that can be single-stepped in the SYSENTER entry path, then * assume that this is a useless single-step trap due to SYSENTER * being invoked with TF set. (We don't know in advance exactly * which instructions will be hit because BTF could plausibly * be set.) 
*/ #ifdef CONFIG_X86_32 return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) < (unsigned long)__end_SYSENTER_singlestep_region - (unsigned long)__begin_SYSENTER_singlestep_region; #elif defined(CONFIG_IA32_EMULATION) return (regs->ip - (unsigned long)entry_SYSENTER_compat) < (unsigned long)__end_entry_SYSENTER_compat - (unsigned long)entry_SYSENTER_compat; #else return false; #endif } static __always_inline unsigned long debug_read_clear_dr6(void) { unsigned long dr6; /* * The Intel SDM says: * * Certain debug exceptions may clear bits 0-3. The remaining * contents of the DR6 register are never cleared by the * processor. To avoid confusion in identifying debug * exceptions, debug handlers should clear the register before * returning to the interrupted task. * * Keep it simple: clear DR6 immediately. */ get_debugreg(dr6, 6); set_debugreg(DR6_RESERVED, 6); dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ return dr6; } /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore * it is possible to get a watchpoint trap here from inside the kernel. * However, the code in ./ptrace.c has ensured that the user can * only set watchpoints on userspace addresses. Therefore the in-kernel * watchpoint trap can only occur in code which is reading/writing * from user space. Such code must not hold kernel locks (since it * can equally take a page fault), therefore it is safe to call * force_sig_info even though that claims and releases locks. * * Code in ./signal.c ensures that the debug control register * is restored before we deliver any signal, and therefore that * user code runs with the correct debug control register even though * we clear it here. * * Being careful here means that we don't have to be as careful in a * lot of more complicated places (task switching can be a bit lazy * about restoring all the debug state, and ptrace doesn't have to * find every occurrence of the TF bit that could be saved away even * by user code) * * May run on IST stack. */ static bool notify_debug(struct pt_regs *regs, unsigned long *dr6) { /* * Notifiers will clear bits in @dr6 to indicate the event has been * consumed - hw_breakpoint_handler(), single_stop_cont(). * * Notifiers will set bits in @virtual_dr6 to indicate the desire * for signals - ptrace_triggered(), kgdb_hw_overflow_handler(). */ if (notify_die(DIE_DEBUG, "debug", regs, (long)dr6, 0, SIGTRAP) == NOTIFY_STOP) return true; return false; } static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { /* * Disable breakpoints during exception handling; recursive exceptions * are exceedingly 'fun'. * * Since this function is NOKPROBE, and that also applies to * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a * HW_BREAKPOINT_W on our stack) * * Entry text is excluded for HW_BP_X and cpu_entry_area, which * includes the entry stack is excluded for everything. * * For FRED, nested #DB should just work fine. But when a watchpoint or * breakpoint is set in the code path which is executed by #DB handler, * it results in an endless recursion and stack overflow. Thus we stay * with the IDT approach, i.e., save DR7 and disable #DB. */ unsigned long dr7 = local_db_save(); irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); /* * If something gets miswired and we end up here for a user mode * #DB, we will malfunction. 
*/ WARN_ON_ONCE(user_mode(regs)); if (test_thread_flag(TIF_BLOCKSTEP)) { /* * The SDM says "The processor clears the BTF flag when it * generates a debug exception." but PTRACE_BLOCKSTEP requested * it for userspace, but we just took a kernel #DB, so re-set * BTF. */ unsigned long debugctl; rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl |= DEBUGCTLMSR_BTF; wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); } /* * Catch SYSENTER with TF set and clear DR_STEP. If this hit a * watchpoint at the same time then that will still be handled. */ if (!cpu_feature_enabled(X86_FEATURE_FRED) && (dr6 & DR_STEP) && is_sysenter_singlestep(regs)) dr6 &= ~DR_STEP; /* * The kernel doesn't use INT1 */ if (!dr6) goto out; if (notify_debug(regs, &dr6)) goto out; /* * The kernel doesn't use TF single-step outside of: * * - Kprobes, consumed through kprobe_debug_handler() * - KGDB, consumed through notify_debug() * * So if we get here with DR_STEP set, something is wonky. * * A known way to trigger this is through QEMU's GDB stub, * which leaks #DB into the guest and causes IST recursion. */ if (WARN_ON_ONCE(dr6 & DR_STEP)) regs->flags &= ~X86_EFLAGS_TF; out: instrumentation_end(); irqentry_nmi_exit(regs, irq_state); local_db_restore(dr7); } static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { bool icebp; /* * If something gets miswired and we end up here for a kernel mode * #DB, we will malfunction. */ WARN_ON_ONCE(!user_mode(regs)); /* * NB: We can't easily clear DR7 here because * irqentry_exit_to_usermode() can invoke ptrace, schedule, access * user memory, etc. This means that a recursive #DB is possible. If * this happens, that #DB will hit exc_debug_kernel() and clear DR7. * Since we're not on the IST stack right now, everything will be * fine. */ irqentry_enter_from_user_mode(regs); instrumentation_begin(); /* * Start the virtual/ptrace DR6 value with just the DR_STEP mask * of the real DR6. ptrace_triggered() will set the DR_TRAPn bits. * * Userspace expects DR_STEP to be visible in ptrace_get_debugreg(6) * even if it is not the result of PTRACE_SINGLESTEP. */ current->thread.virtual_dr6 = (dr6 & DR_STEP); /* * The SDM says "The processor clears the BTF flag when it * generates a debug exception." Clear TIF_BLOCKSTEP to keep * TIF_BLOCKSTEP in sync with the hardware BTF flag. */ clear_thread_flag(TIF_BLOCKSTEP); /* * If dr6 has no reason to give us about the origin of this trap, * then it's very likely the result of an icebp/int01 trap. * User wants a sigtrap for that. */ icebp = !dr6; if (notify_debug(regs, &dr6)) goto out; /* It's safe to allow irq's after DR6 has been saved */ local_irq_enable(); if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *)regs, 0, X86_TRAP_DB); goto out_irq; } /* #DB for bus lock can only be triggered from userspace. */ if (dr6 & DR_BUS_LOCK) handle_bus_lock(regs); /* Add the virtual_dr6 bits for signals. 
*/ dr6 |= current->thread.virtual_dr6; if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp) send_sigtrap(regs, 0, get_si_code(dr6)); out_irq: local_irq_disable(); out: instrumentation_end(); irqentry_exit_to_user_mode(regs); } #ifdef CONFIG_X86_64 /* IST stack entry */ DEFINE_IDTENTRY_DEBUG(exc_debug) { exc_debug_kernel(regs, debug_read_clear_dr6()); } /* User entry, runs on regular task stack */ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { exc_debug_user(regs, debug_read_clear_dr6()); } #ifdef CONFIG_X86_FRED /* * When occurred on different ring level, i.e., from user or kernel * context, #DB needs to be handled on different stack: User #DB on * current task stack, while kernel #DB on a dedicated stack. * * This is exactly how FRED event delivery invokes an exception * handler: ring 3 event on level 0 stack, i.e., current task stack; * ring 0 event on the #DB dedicated stack specified in the * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception * entry stub doesn't do stack switch. */ DEFINE_FREDENTRY_DEBUG(exc_debug) { /* * FRED #DB stores DR6 on the stack in the format which * debug_read_clear_dr6() returns for the IDT entry points. */ unsigned long dr6 = fred_event_data(regs); if (user_mode(regs)) exc_debug_user(regs, dr6); else exc_debug_kernel(regs, dr6); } #endif /* CONFIG_X86_FRED */ #else /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) { unsigned long dr6 = debug_read_clear_dr6(); if (user_mode(regs)) exc_debug_user(regs, dr6); else exc_debug_kernel(regs, dr6); } #endif /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ static void math_error(struct pt_regs *regs, int trapnr) { struct task_struct *task = current; struct fpu *fpu = &task->thread.fpu; int si_code; char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; cond_local_irq_enable(regs); if (!user_mode(regs)) { if (fixup_exception(regs, trapnr, 0, 0)) goto exit; task->thread.error_code = 0; task->thread.trap_nr = trapnr; if (notify_die(DIE_TRAP, str, regs, 0, trapnr, SIGFPE) != NOTIFY_STOP) die(str, regs, 0); goto exit; } /* * Synchronize the FPU register state to the memory register state * if necessary. This allows the exception handler to inspect it. */ fpu_sync_fpstate(fpu); task->thread.trap_nr = trapnr; task->thread.error_code = 0; si_code = fpu__exception_code(fpu, trapnr); /* Retry when we get spurious exceptions: */ if (!si_code) goto exit; if (fixup_vdso_exception(regs, trapnr, 0, 0)) goto exit; force_sig_fault(SIGFPE, si_code, (void __user *)uprobe_get_trap_addr(regs)); exit: cond_local_irq_disable(regs); } DEFINE_IDTENTRY(exc_coprocessor_error) { math_error(regs, X86_TRAP_MF); } DEFINE_IDTENTRY(exc_simd_coprocessor_error) { if (IS_ENABLED(CONFIG_X86_INVD_BUG)) { /* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */ if (!static_cpu_has(X86_FEATURE_XMM)) { __exc_general_protection(regs, 0); return; } } math_error(regs, X86_TRAP_XF); } DEFINE_IDTENTRY(exc_spurious_interrupt_bug) { /* * This addresses a Pentium Pro Erratum: * * PROBLEM: If the APIC subsystem is configured in mixed mode with * Virtual Wire mode implemented through the local APIC, an * interrupt vector of 0Fh (Intel reserved encoding) may be * generated by the local APIC (Int 15). This vector may be * generated upon receipt of a spurious interrupt (an interrupt * which is removed before the system receives the INTA sequence) * instead of the programmed 8259 spurious interrupt vector. 
* * IMPLICATION: The spurious interrupt vector programmed in the * 8259 is normally handled by an operating system's spurious * interrupt handler. However, a vector of 0Fh is unknown to some * operating systems, which would crash if this erratum occurred. * * In theory this could be limited to 32bit, but the handler is not * hurting and who knows which other CPUs suffer from this. */ } static bool handle_xfd_event(struct pt_regs *regs) { u64 xfd_err; int err; if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) return false; rdmsrl(MSR_IA32_XFD_ERR, xfd_err); if (!xfd_err) return false; wrmsrl(MSR_IA32_XFD_ERR, 0); /* Die if that happens in kernel space */ if (WARN_ON(!user_mode(regs))) return false; local_irq_enable(); err = xfd_enable_feature(xfd_err); switch (err) { case -EPERM: force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs)); break; case -EFAULT: force_sig(SIGSEGV); break; } local_irq_disable(); return true; } DEFINE_IDTENTRY(exc_device_not_available) { unsigned long cr0 = read_cr0(); if (handle_xfd_event(regs)) return; #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) { struct math_emu_info info = { }; cond_local_irq_enable(regs); info.regs = regs; math_emulate(&info); cond_local_irq_disable(regs); return; } #endif /* This should not happen. */ if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) { /* Try to fix it up and carry on. */ write_cr0(cr0 & ~X86_CR0_TS); } else { /* * Something terrible happened, and we're better off trying * to kill the task than getting stuck in a never-ending * loop of #NM faults. */ die("unexpected #NM exception", regs, 0); } } #ifdef CONFIG_INTEL_TDX_GUEST #define VE_FAULT_STR "VE fault" static void ve_raise_fault(struct pt_regs *regs, long error_code, unsigned long address) { if (user_mode(regs)) { gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR); return; } if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR, address)) { return; } die_addr(VE_FAULT_STR, regs, error_code, address); } /* * Virtualization Exceptions (#VE) are delivered to TDX guests due to * specific guest actions which may happen in either user space or the * kernel: * * * Specific instructions (WBINVD, for example) * * Specific MSR accesses * * Specific CPUID leaf accesses * * Access to specific guest physical addresses * * In the settings that Linux will run in, virtualization exceptions are * never generated on accesses to normal, TD-private memory that has been * accepted (by BIOS or with tdx_enc_status_changed()). * * Syscall entry code has a critical window where the kernel stack is not * yet set up. Any exception in this window leads to hard to debug issues * and can be exploited for privilege escalation. Exceptions in the NMI * entry code also cause issues. Returning from the exception handler with * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack. * * For these reasons, the kernel avoids #VEs during the syscall gap and * the NMI entry code. Entry code paths do not access TD-shared memory, * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves * that might generate #VE. VMM can remove memory from TD at any point, * but access to unaccepted (or missing) private memory leads to VM * termination, not to #VE. * * Similarly to page faults and breakpoints, #VEs are allowed in NMI * handlers once the kernel is ready to deal with nested NMIs. * * During #VE delivery, all interrupts, including NMIs, are blocked until * TDGETVEINFO is called. 
It prevents #VE nesting until the kernel reads * the VE info. * * If a guest kernel action which would normally cause a #VE occurs in * the interrupt-disabled region before TDGETVEINFO, a #DF (fault * exception) is delivered to the guest which will result in an oops. * * The entry code has been audited carefully for following these expectations. * Changes in the entry code have to be audited for correctness vs. this * aspect. Similarly to #PF, #VE in these places will expose kernel to * privilege escalation or may lead to random crashes. */ DEFINE_IDTENTRY(exc_virtualization_exception) { struct ve_info ve; /* * NMIs/Machine-checks/Interrupts will be in a disabled state * till TDGETVEINFO TDCALL is executed. This ensures that VE * info cannot be overwritten by a nested #VE. */ tdx_get_ve_info(&ve); cond_local_irq_enable(regs); /* * If tdx_handle_virt_exception() could not process * it successfully, treat it as #GP(0) and handle it. */ if (!tdx_handle_virt_exception(regs, &ve)) ve_raise_fault(regs, 0, ve.gla); cond_local_irq_disable(regs); } #endif #ifdef CONFIG_X86_32 DEFINE_IDTENTRY_SW(iret_error) { local_irq_enable(); if (notify_die(DIE_TRAP, "iret exception", regs, 0, X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0, ILL_BADSTK, (void __user *)NULL); } local_irq_disable(); } #endif void __init trap_init(void) { /* Init cpu_entry_area before IST entries are set up */ setup_cpu_entry_areas(); /* Init GHCB memory pages when running as an SEV-ES guest */ sev_es_init_vc_handling(); /* Initialize TSS before setting up traps so ISTs work */ cpu_init_exception_handling(true); /* Setup traps as cpu_init() might #GP */ if (!cpu_feature_enabled(X86_FEATURE_FRED)) idt_setup_traps(); cpu_init(); } |
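/*
 * Illustrative sketch (not part of traps.c): the canonical-address heuristic
 * that get_kernel_gp_address() above uses to annotate kernel #GP reports,
 * rewritten as standalone userspace C. It assumes 4-level paging, i.e. a
 * 47-bit user canonical half; VIRTUAL_MASK is an illustrative stand-in for
 * the kernel's __VIRTUAL_MASK and opnd_len for insn.opnd_bytes.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VIRTUAL_MASK ((1ULL << 47) - 1)

/* true if the accessed bytes are neither fully user- nor kernel-canonical */
static bool access_is_non_canonical(uint64_t addr, unsigned int opnd_len)
{
	/* the operand does not start in the kernel half ... */
	if (addr >= ~VIRTUAL_MASK)
		return false;
	/* ... but its last byte lies beyond the user canonical half */
	return addr + opnd_len - 1 > VIRTUAL_MASK;
}

int main(void)
{
	/* an 8-byte access ending just past the user half is flagged */
	printf("%d\n", access_is_non_canonical(VIRTUAL_MASK - 3, 8)); /* 1 */
	printf("%d\n", access_is_non_canonical(0x1000, 8));           /* 0 */
	return 0;
}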
| 84 84 5 84 83 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux I2C core OF support code * * Copyright (C) 2008 Jochen Friedrich <jochen@scram.de> * based on a previous patch from Jon Smirl <jonsmirl@gmail.com> * * Copyright (C) 2013, 2018 Wolfram Sang <wsa@kernel.org> */ #include <dt-bindings/i2c/i2c.h> #include <linux/device.h> #include <linux/err.h> #include <linux/i2c.h> #include <linux/module.h> #include <linux/of.h> #include <linux/of_device.h> #include <linux/sysfs.h> #include "i2c-core.h" int of_i2c_get_board_info(struct device *dev, struct device_node *node, struct i2c_board_info *info) { u32 addr; int ret; memset(info, 0, sizeof(*info)); if (of_alias_from_compatible(node, info->type, sizeof(info->type)) < 0) { dev_err(dev, "of_i2c: modalias failure on %pOF\n", node); return -EINVAL; } ret = of_property_read_u32(node, "reg", &addr); if (ret) { dev_err(dev, "of_i2c: invalid reg on %pOF\n", node); return ret; } if (addr & I2C_TEN_BIT_ADDRESS) { addr &= ~I2C_TEN_BIT_ADDRESS; info->flags |= I2C_CLIENT_TEN; } if (addr & I2C_OWN_SLAVE_ADDRESS) { addr &= ~I2C_OWN_SLAVE_ADDRESS; info->flags |= I2C_CLIENT_SLAVE; } info->addr = addr; info->of_node = node; info->fwnode = of_fwnode_handle(node); if (of_property_read_bool(node, "host-notify")) info->flags |= I2C_CLIENT_HOST_NOTIFY; if (of_property_read_bool(node, "wakeup-source")) info->flags |= I2C_CLIENT_WAKE; return 0; } EXPORT_SYMBOL_GPL(of_i2c_get_board_info); static struct i2c_client *of_i2c_register_device(struct i2c_adapter *adap, struct device_node *node) { struct i2c_client *client; struct i2c_board_info info; int ret; dev_dbg(&adap->dev, "of_i2c: register %pOF\n", node); ret = of_i2c_get_board_info(&adap->dev, node, &info); if (ret) return ERR_PTR(ret); client = i2c_new_client_device(adap, &info); if (IS_ERR(client)) dev_err(&adap->dev, "of_i2c: Failure registering %pOF\n", node); return client; } void of_i2c_register_devices(struct i2c_adapter *adap) { struct device_node *bus, *node; struct i2c_client *client; /* Only register child devices if the adapter has a node pointer set */ if (!adap->dev.of_node) return; dev_dbg(&adap->dev, "of_i2c: walking child nodes\n"); bus = of_get_child_by_name(adap->dev.of_node, "i2c-bus"); if (!bus) bus = of_node_get(adap->dev.of_node); for_each_available_child_of_node(bus, node) { if (of_node_test_and_set_flag(node, OF_POPULATED)) continue; client = of_i2c_register_device(adap, node); if (IS_ERR(client)) { dev_err(&adap->dev, "Failed to create I2C device for %pOF\n", node); of_node_clear_flag(node, OF_POPULATED); } } of_node_put(bus); } static const struct of_device_id* i2c_of_match_device_sysfs(const struct of_device_id *matches, struct i2c_client *client) { const char *name; for (; matches->compatible[0]; matches++) { /* * 
Adding devices through the i2c sysfs interface provides us * a string to match which may be compatible with the device * tree compatible strings, however with no actual of_node the * of_match_device() will not match */ if (sysfs_streq(client->name, matches->compatible)) return matches; name = strchr(matches->compatible, ','); if (!name) name = matches->compatible; else name++; if (sysfs_streq(client->name, name)) return matches; } return NULL; } const struct of_device_id *i2c_of_match_device(const struct of_device_id *matches, struct i2c_client *client) { const struct of_device_id *match; if (!(client && matches)) return NULL; match = of_match_device(matches, &client->dev); if (match) return match; return i2c_of_match_device_sysfs(matches, client); } EXPORT_SYMBOL_GPL(i2c_of_match_device); #if IS_ENABLED(CONFIG_OF_DYNAMIC) static int of_i2c_notify(struct notifier_block *nb, unsigned long action, void *arg) { struct of_reconfig_data *rd = arg; struct i2c_adapter *adap; struct i2c_client *client; switch (of_reconfig_get_state_change(action, rd)) { case OF_RECONFIG_CHANGE_ADD: adap = of_find_i2c_adapter_by_node(rd->dn->parent); if (adap == NULL) return NOTIFY_OK; /* not for us */ if (of_node_test_and_set_flag(rd->dn, OF_POPULATED)) { put_device(&adap->dev); return NOTIFY_OK; } /* * Clear the flag before adding the device so that fw_devlink * doesn't skip adding consumers to this device. */ rd->dn->fwnode.flags &= ~FWNODE_FLAG_NOT_DEVICE; client = of_i2c_register_device(adap, rd->dn); if (IS_ERR(client)) { dev_err(&adap->dev, "failed to create client for '%pOF'\n", rd->dn); put_device(&adap->dev); of_node_clear_flag(rd->dn, OF_POPULATED); return notifier_from_errno(PTR_ERR(client)); } put_device(&adap->dev); break; case OF_RECONFIG_CHANGE_REMOVE: /* already depopulated? */ if (!of_node_check_flag(rd->dn, OF_POPULATED)) return NOTIFY_OK; /* find our device by node */ client = of_find_i2c_device_by_node(rd->dn); if (client == NULL) return NOTIFY_OK; /* no? not meant for us */ /* unregister takes one ref away */ i2c_unregister_device(client); /* and put the reference of the find */ put_device(&client->dev); break; } return NOTIFY_OK; } struct notifier_block i2c_of_notifier = { .notifier_call = of_i2c_notify, }; #endif /* CONFIG_OF_DYNAMIC */ |
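/*
 * Illustrative sketch (not part of this file): how of_i2c_get_board_info()
 * above decodes the "reg" cell of an I2C child node. The two high bits carry
 * the ten-bit-address and own-slave-address markers from
 * <dt-bindings/i2c/i2c.h>; the flag values are repeated here only so the
 * example is self-contained.
 */
#include <stdint.h>
#include <stdio.h>

#define TEN_BIT_ADDRESS   (1u << 31)	/* mirrors I2C_TEN_BIT_ADDRESS */
#define OWN_SLAVE_ADDRESS (1u << 30)	/* mirrors I2C_OWN_SLAVE_ADDRESS */

int main(void)
{
	/* e.g. a node with: reg = <(I2C_OWN_SLAVE_ADDRESS | 0x64)>; */
	uint32_t reg = OWN_SLAVE_ADDRESS | 0x64;
	int ten_bit = !!(reg & TEN_BIT_ADDRESS);
	int own_slave = !!(reg & OWN_SLAVE_ADDRESS);
	uint32_t addr = reg & ~(TEN_BIT_ADDRESS | OWN_SLAVE_ADDRESS);

	printf("addr=0x%02x ten_bit=%d own_slave=%d\n", addr, ten_bit, own_slave);
	return 0;
}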
| 114 53 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * * This file is part of the SCTP kernel implementation * * These are definitions needed by the state machine. * * Please send any bug reports or fixes you make to the * email addresses: * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Xingang Guo <xingang.guo@intel.com> * Jon Grimm <jgrimm@us.ibm.com> * Dajiang Zhang <dajiang.zhang@nokia.com> * Sridhar Samudrala <sri@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * Kevin Gao <kevin.gao@intel.com> */ #include <linux/types.h> #include <linux/compiler.h> #include <linux/slab.h> #include <linux/in.h> #include <net/sctp/command.h> #include <net/sctp/sctp.h> #ifndef __sctp_sm_h__ #define __sctp_sm_h__ /* * Possible values for the disposition are: */ enum sctp_disposition { SCTP_DISPOSITION_DISCARD, /* No further processing. */ SCTP_DISPOSITION_CONSUME, /* Process return values normally. */ SCTP_DISPOSITION_NOMEM, /* We ran out of memory--recover. */ SCTP_DISPOSITION_DELETE_TCB, /* Close the association. */ SCTP_DISPOSITION_ABORT, /* Close the association NOW. */ SCTP_DISPOSITION_VIOLATION, /* The peer is misbehaving. */ SCTP_DISPOSITION_NOT_IMPL, /* This entry is not implemented. */ SCTP_DISPOSITION_ERROR, /* This is plain old user error. */ SCTP_DISPOSITION_BUG, /* This is a bug. 
*/ }; typedef enum sctp_disposition (sctp_state_fn_t) ( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, struct sctp_cmd_seq *commands); typedef void (sctp_timer_event_t) (struct timer_list *); struct sctp_sm_table_entry { sctp_state_fn_t *fn; const char *name; }; /* A naming convention of "sctp_sf_xxx" applies to all the state functions * currently in use. */ /* Prototypes for generic state functions. */ sctp_state_fn_t sctp_sf_not_impl; sctp_state_fn_t sctp_sf_bug; /* Prototypes for gener timer state functions. */ sctp_state_fn_t sctp_sf_timer_ignore; /* Prototypes for chunk state functions. */ sctp_state_fn_t sctp_sf_do_9_1_abort; sctp_state_fn_t sctp_sf_cookie_wait_abort; sctp_state_fn_t sctp_sf_cookie_echoed_abort; sctp_state_fn_t sctp_sf_shutdown_pending_abort; sctp_state_fn_t sctp_sf_shutdown_sent_abort; sctp_state_fn_t sctp_sf_shutdown_ack_sent_abort; sctp_state_fn_t sctp_sf_do_5_1B_init; sctp_state_fn_t sctp_sf_do_5_1C_ack; sctp_state_fn_t sctp_sf_do_5_1D_ce; sctp_state_fn_t sctp_sf_do_5_1E_ca; sctp_state_fn_t sctp_sf_do_4_C; sctp_state_fn_t sctp_sf_eat_data_6_2; sctp_state_fn_t sctp_sf_eat_data_fast_4_4; sctp_state_fn_t sctp_sf_eat_sack_6_2; sctp_state_fn_t sctp_sf_operr_notify; sctp_state_fn_t sctp_sf_t1_init_timer_expire; sctp_state_fn_t sctp_sf_t1_cookie_timer_expire; sctp_state_fn_t sctp_sf_t2_timer_expire; sctp_state_fn_t sctp_sf_t4_timer_expire; sctp_state_fn_t sctp_sf_t5_timer_expire; sctp_state_fn_t sctp_sf_sendbeat_8_3; sctp_state_fn_t sctp_sf_beat_8_3; sctp_state_fn_t sctp_sf_backbeat_8_3; sctp_state_fn_t sctp_sf_do_9_2_final; sctp_state_fn_t sctp_sf_do_9_2_shutdown; sctp_state_fn_t sctp_sf_do_9_2_shut_ctsn; sctp_state_fn_t sctp_sf_do_ecn_cwr; sctp_state_fn_t sctp_sf_do_ecne; sctp_state_fn_t sctp_sf_ootb; sctp_state_fn_t sctp_sf_pdiscard; sctp_state_fn_t sctp_sf_violation; sctp_state_fn_t sctp_sf_discard_chunk; sctp_state_fn_t sctp_sf_do_5_2_1_siminit; sctp_state_fn_t sctp_sf_do_5_2_2_dupinit; sctp_state_fn_t sctp_sf_do_5_2_3_initack; sctp_state_fn_t sctp_sf_do_5_2_4_dupcook; sctp_state_fn_t sctp_sf_unk_chunk; sctp_state_fn_t sctp_sf_do_8_5_1_E_sa; sctp_state_fn_t sctp_sf_cookie_echoed_err; sctp_state_fn_t sctp_sf_do_asconf; sctp_state_fn_t sctp_sf_do_asconf_ack; sctp_state_fn_t sctp_sf_do_reconf; sctp_state_fn_t sctp_sf_do_9_2_reshutack; sctp_state_fn_t sctp_sf_eat_fwd_tsn; sctp_state_fn_t sctp_sf_eat_fwd_tsn_fast; sctp_state_fn_t sctp_sf_eat_auth; /* Prototypes for primitive event state functions. */ sctp_state_fn_t sctp_sf_do_prm_asoc; sctp_state_fn_t sctp_sf_do_prm_send; sctp_state_fn_t sctp_sf_do_9_2_prm_shutdown; sctp_state_fn_t sctp_sf_cookie_wait_prm_shutdown; sctp_state_fn_t sctp_sf_cookie_echoed_prm_shutdown; sctp_state_fn_t sctp_sf_do_9_1_prm_abort; sctp_state_fn_t sctp_sf_cookie_wait_prm_abort; sctp_state_fn_t sctp_sf_cookie_echoed_prm_abort; sctp_state_fn_t sctp_sf_shutdown_pending_prm_abort; sctp_state_fn_t sctp_sf_shutdown_sent_prm_abort; sctp_state_fn_t sctp_sf_shutdown_ack_sent_prm_abort; sctp_state_fn_t sctp_sf_error_closed; sctp_state_fn_t sctp_sf_error_shutdown; sctp_state_fn_t sctp_sf_ignore_primitive; sctp_state_fn_t sctp_sf_do_prm_requestheartbeat; sctp_state_fn_t sctp_sf_do_prm_asconf; sctp_state_fn_t sctp_sf_do_prm_reconf; /* Prototypes for other event state functions. 
*/ sctp_state_fn_t sctp_sf_do_no_pending_tsn; sctp_state_fn_t sctp_sf_do_9_2_start_shutdown; sctp_state_fn_t sctp_sf_do_9_2_shutdown_ack; sctp_state_fn_t sctp_sf_ignore_other; sctp_state_fn_t sctp_sf_cookie_wait_icmp_abort; /* Prototypes for timeout event state functions. */ sctp_state_fn_t sctp_sf_do_6_3_3_rtx; sctp_state_fn_t sctp_sf_send_reconf; sctp_state_fn_t sctp_sf_send_probe; sctp_state_fn_t sctp_sf_do_6_2_sack; sctp_state_fn_t sctp_sf_autoclose_timer_expire; /* Prototypes for utility support functions. */ const struct sctp_sm_table_entry *sctp_sm_lookup_event( struct net *net, enum sctp_event_type event_type, enum sctp_state state, union sctp_subtype event_subtype); int sctp_chunk_iif(const struct sctp_chunk *); struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *, struct sctp_chunk *, gfp_t gfp); /* Prototypes for chunk-building functions. */ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, const struct sctp_bind_addr *bp, gfp_t gfp, int vparam_len); struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const gfp_t gfp, const int unkparam_len); struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, const __u32 lowest_tsn, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp); struct sctp_chunk *sctp_make_ifwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_ifwdtsn_skip *skiplist); struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp); struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, const __u32 lowest_tsn); struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc); struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_shutdown_complete( const struct sctp_association *asoc, const struct sctp_chunk *chunk); int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause, size_t paylen); struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const size_t hint); struct sctp_chunk *sctp_make_abort_no_data(const struct sctp_association *asoc, const struct sctp_chunk *chunk, __u32 tsn); struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc, struct msghdr *msg, size_t msg_len); struct sctp_chunk *sctp_make_abort_violation( const struct sctp_association *asoc, const struct sctp_chunk *chunk, const __u8 *payload, const size_t paylen); struct sctp_chunk *sctp_make_violation_paramlen( const struct sctp_association *asoc, const struct sctp_chunk *chunk, struct sctp_paramhdr *param); struct sctp_chunk *sctp_make_violation_max_retrans( const struct sctp_association *asoc, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_new_encap_port( const struct sctp_association *asoc, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, const struct sctp_transport *transport, __u32 probe_size); struct sctp_chunk 
*sctp_make_heartbeat_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const void *payload, const size_t paylen); struct sctp_chunk *sctp_make_pad(const struct sctp_association *asoc, int len); struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc, const struct sctp_chunk *chunk, __be16 cause_code, const void *payload, size_t paylen, size_t reserve_tail); struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, union sctp_addr *laddr, struct sockaddr *addrs, int addrcnt, __be16 flags); struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc, union sctp_addr *addr); bool sctp_verify_asconf(const struct sctp_association *asoc, struct sctp_chunk *chunk, bool addr_param_needed, struct sctp_paramhdr **errp); struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, struct sctp_chunk *asconf); int sctp_process_asconf_ack(struct sctp_association *asoc, struct sctp_chunk *asconf_ack); struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_fwdtsn_skip *skiplist); struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc, __u16 key_id); struct sctp_chunk *sctp_make_strreset_req(const struct sctp_association *asoc, __u16 stream_num, __be16 *stream_list, bool out, bool in); struct sctp_chunk *sctp_make_strreset_tsnreq( const struct sctp_association *asoc); struct sctp_chunk *sctp_make_strreset_addstrm( const struct sctp_association *asoc, __u16 out, __u16 in); struct sctp_chunk *sctp_make_strreset_resp(const struct sctp_association *asoc, __u32 result, __u32 sn); struct sctp_chunk *sctp_make_strreset_tsnresp(struct sctp_association *asoc, __u32 result, __u32 sn, __u32 sender_tsn, __u32 receiver_tsn); bool sctp_verify_reconf(const struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_paramhdr **errp); void sctp_chunk_assign_tsn(struct sctp_chunk *chunk); void sctp_chunk_assign_ssn(struct sctp_chunk *chunk); /* Prototypes for stream-processing functions. */ struct sctp_chunk *sctp_process_strreset_outreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp); struct sctp_chunk *sctp_process_strreset_inreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp); struct sctp_chunk *sctp_process_strreset_tsnreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp); struct sctp_chunk *sctp_process_strreset_addstrm_out( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp); struct sctp_chunk *sctp_process_strreset_addstrm_in( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp); struct sctp_chunk *sctp_process_strreset_resp( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp); /* Prototypes for statetable processing. 
*/ int sctp_do_sm(struct net *net, enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, gfp_t gfp); /* 2nd level prototypes */ void sctp_generate_t3_rtx_event(struct timer_list *t); void sctp_generate_heartbeat_event(struct timer_list *t); void sctp_generate_reconf_event(struct timer_list *t); void sctp_generate_probe_event(struct timer_list *t); void sctp_generate_proto_unreach_event(struct timer_list *t); void sctp_ootb_pkt_free(struct sctp_packet *packet); struct sctp_association *sctp_unpack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, struct sctp_chunk *chunk, gfp_t gfp, int *err, struct sctp_chunk **err_chk_p); /* 3rd level prototypes */ __u32 sctp_generate_tag(const struct sctp_endpoint *ep); __u32 sctp_generate_tsn(const struct sctp_endpoint *ep); /* Extern declarations for major data structures. */ extern sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES]; /* Get the size of a DATA chunk payload. */ static inline __u16 sctp_data_size(struct sctp_chunk *chunk) { __u16 size; size = ntohs(chunk->chunk_hdr->length); size -= sctp_datachk_len(&chunk->asoc->stream); return size; } /* Compare two TSNs */ #define TSN_lt(a,b) \ (typecheck(__u32, a) && \ typecheck(__u32, b) && \ ((__s32)((a) - (b)) < 0)) #define TSN_lte(a,b) \ (typecheck(__u32, a) && \ typecheck(__u32, b) && \ ((__s32)((a) - (b)) <= 0)) /* Compare two MIDs */ #define MID_lt(a, b) \ (typecheck(__u32, a) && \ typecheck(__u32, b) && \ ((__s32)((a) - (b)) < 0)) /* Compare two SSNs */ #define SSN_lt(a,b) \ (typecheck(__u16, a) && \ typecheck(__u16, b) && \ ((__s16)((a) - (b)) < 0)) /* ADDIP 3.1.1 */ #define ADDIP_SERIAL_gte(a,b) \ (typecheck(__u32, a) && \ typecheck(__u32, b) && \ ((__s32)((b) - (a)) <= 0)) /* Check VTAG of the packet matches the sender's own tag. */ static inline int sctp_vtag_verify(const struct sctp_chunk *chunk, const struct sctp_association *asoc) { /* RFC 2960 Sec 8.5 When receiving an SCTP packet, the endpoint * MUST ensure that the value in the Verification Tag field of * the received SCTP packet matches its own Tag. If the received * Verification Tag value does not match the receiver's own * tag value, the receiver shall silently discard the packet... */ if (ntohl(chunk->sctp_hdr->vtag) != asoc->c.my_vtag) return 0; chunk->transport->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port; return 1; } /* Check VTAG of the packet matches the sender's own tag and the T bit is * not set, OR its peer's tag and the T bit is set in the Chunk Flags. */ static inline int sctp_vtag_verify_either(const struct sctp_chunk *chunk, const struct sctp_association *asoc) { /* RFC 2960 Section 8.5.1, sctpimpguide Section 2.41 * * B) The receiver of a ABORT MUST accept the packet * if the Verification Tag field of the packet matches its own tag * and the T bit is not set * OR * it is set to its peer's tag and the T bit is set in the Chunk * Flags. * Otherwise, the receiver MUST silently discard the packet * and take no further action. * * C) The receiver of a SHUTDOWN COMPLETE shall accept the packet * if the Verification Tag field of the packet matches its own tag * and the T bit is not set * OR * it is set to its peer's tag and the T bit is set in the Chunk * Flags. * Otherwise, the receiver MUST silently discard the packet * and take no further action. An endpoint MUST ignore the * SHUTDOWN COMPLETE if it is not in the SHUTDOWN-ACK-SENT state. 
*/ if ((!sctp_test_T_bit(chunk) && (ntohl(chunk->sctp_hdr->vtag) == asoc->c.my_vtag)) || (sctp_test_T_bit(chunk) && asoc->c.peer_vtag && (ntohl(chunk->sctp_hdr->vtag) == asoc->c.peer_vtag))) { return 1; } return 0; } #endif /* __sctp_sm_h__ */ |
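/*
 * Illustrative sketch (not part of this header): the wrap-around ("serial
 * number") comparison behind TSN_lt()/TSN_lte() above, as plain userspace C
 * with the kernel typecheck() omitted. Evaluating the difference as a signed
 * 32-bit value keeps the ordering correct across the 2^32 TSN wrap.
 */
#include <stdint.h>
#include <stdio.h>

static int tsn_lt(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

int main(void)
{
	printf("%d\n", tsn_lt(0xFFFFFFFFu, 0)); /* 1: 0xFFFFFFFF precedes 0 across the wrap */
	printf("%d\n", tsn_lt(0, 0xFFFFFFFFu)); /* 0 */
	printf("%d\n", tsn_lt(5, 10));          /* 1 */
	return 0;
}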
912 913 914 915 916 917 918 919 920 921 922 923 | // SPDX-License-Identifier: GPL-2.0-or-later /* * corsair-psu.c - Linux driver for Corsair power supplies with HID sensors interface * Copyright (C) 2020 Wilken Gottwalt <wilken.gottwalt@posteo.net> */ #include <linux/completion.h> #include <linux/debugfs.h> #include <linux/errno.h> #include <linux/hid.h> #include <linux/hwmon.h> #include <linux/hwmon-sysfs.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/types.h> /* * Corsair protocol for PSUs * * message size = 64 bytes (request and response, little endian) * request: * [length][command][param0][param1][paramX]... * reply: * [echo of length][echo of command][data0][data1][dataX]... * * - commands are byte sized opcodes * - length is the sum of all bytes of the commands/params * - the micro-controller of most of these PSUs support concatenation in the request and reply, * but it is better to not rely on this (it is also hard to parse) * - the driver uses raw events to be accessible from userspace (though this is not really * supported, it is just there for convenience, may be removed in the future) * - a reply always starts with the length and command in the same order the request used it * - length of the reply data is specific to the command used * - some of the commands work on a rail and can be switched to a specific rail (0 = 12v, * 1 = 5v, 2 = 3.3v) * - the format of the init command 0xFE is swapped length/command bytes * - parameter bytes amount and values are specific to the command (rail setting is the only * one for now that uses non-zero values) * - the driver supports debugfs for values not fitting into the hwmon class * - not every device class (HXi or RMi) supports all commands * - if configured wrong the PSU resets or shuts down, often before actually hitting the * reported critical temperature * - new models like HX1500i Series 2023 have changes in the reported vendor and product * strings, both are slightly longer now, report vendor and product in one string and are * the same now */ #define DRIVER_NAME "corsair-psu" #define REPLY_SIZE 24 /* max length of a reply to a single command */ #define CMD_BUFFER_SIZE 64 #define CMD_TIMEOUT_MS 250 #define SECONDS_PER_HOUR (60 * 60) #define SECONDS_PER_DAY (SECONDS_PER_HOUR * 24) #define RAIL_COUNT 3 /* 3v3 + 5v + 12v */ #define TEMP_COUNT 2 #define OCP_MULTI_RAIL 0x02 #define PSU_CMD_SELECT_RAIL 0x00 /* expects length 2 */ #define PSU_CMD_FAN_PWM 0x3B /* the rest of the commands expect length 3 */ #define PSU_CMD_RAIL_VOLTS_HCRIT 0x40 #define PSU_CMD_RAIL_VOLTS_LCRIT 0x44 #define PSU_CMD_RAIL_AMPS_HCRIT 0x46 #define PSU_CMD_TEMP_HCRIT 0x4F #define PSU_CMD_IN_VOLTS 0x88 #define PSU_CMD_IN_AMPS 0x89 #define PSU_CMD_RAIL_VOLTS 0x8B #define PSU_CMD_RAIL_AMPS 0x8C #define PSU_CMD_TEMP0 0x8D #define PSU_CMD_TEMP1 0x8E #define PSU_CMD_FAN 0x90 #define PSU_CMD_RAIL_WATTS 0x96 #define PSU_CMD_VEND_STR 0x99 #define PSU_CMD_PROD_STR 0x9A #define PSU_CMD_TOTAL_UPTIME 0xD1 #define PSU_CMD_UPTIME 0xD2 #define PSU_CMD_OCPMODE 0xD8 #define PSU_CMD_TOTAL_WATTS 0xEE #define PSU_CMD_FAN_PWM_ENABLE 0xF0 #define PSU_CMD_INIT 0xFE #define L_IN_VOLTS "v_in" #define L_OUT_VOLTS_12V "v_out +12v" #define L_OUT_VOLTS_5V "v_out +5v" #define L_OUT_VOLTS_3_3V "v_out +3.3v" #define L_IN_AMPS "curr in" #define L_AMPS_12V "curr +12v" #define L_AMPS_5V "curr +5v" #define L_AMPS_3_3V "curr +3.3v" #define L_FAN "psu fan" #define L_TEMP0 "vrm temp" #define L_TEMP1 "case 
temp" #define L_WATTS "power total" #define L_WATTS_12V "power +12v" #define L_WATTS_5V "power +5v" #define L_WATTS_3_3V "power +3.3v" static const char *const label_watts[] = { L_WATTS, L_WATTS_12V, L_WATTS_5V, L_WATTS_3_3V }; static const char *const label_volts[] = { L_IN_VOLTS, L_OUT_VOLTS_12V, L_OUT_VOLTS_5V, L_OUT_VOLTS_3_3V }; static const char *const label_amps[] = { L_IN_AMPS, L_AMPS_12V, L_AMPS_5V, L_AMPS_3_3V }; struct corsairpsu_data { struct hid_device *hdev; struct device *hwmon_dev; struct dentry *debugfs; struct completion wait_completion; struct mutex lock; /* for locking access to cmd_buffer */ u8 *cmd_buffer; char vendor[REPLY_SIZE]; char product[REPLY_SIZE]; long temp_crit[TEMP_COUNT]; long in_crit[RAIL_COUNT]; long in_lcrit[RAIL_COUNT]; long curr_crit[RAIL_COUNT]; u8 temp_crit_support; u8 in_crit_support; u8 in_lcrit_support; u8 curr_crit_support; bool in_curr_cmd_support; /* not all commands are supported on every PSU */ }; /* some values are SMBus LINEAR11 data which need a conversion */ static int corsairpsu_linear11_to_int(const u16 val, const int scale) { const int exp = ((s16)val) >> 11; const int mant = (((s16)(val & 0x7ff)) << 5) >> 5; const int result = mant * scale; return (exp >= 0) ? (result << exp) : (result >> -exp); } /* the micro-controller uses percentage values to control pwm */ static int corsairpsu_dutycycle_to_pwm(const long dutycycle) { const int result = (256 << 16) / 100; return (result * dutycycle) >> 16; } static int corsairpsu_usb_cmd(struct corsairpsu_data *priv, u8 p0, u8 p1, u8 p2, void *data) { unsigned long time; int ret; memset(priv->cmd_buffer, 0, CMD_BUFFER_SIZE); priv->cmd_buffer[0] = p0; priv->cmd_buffer[1] = p1; priv->cmd_buffer[2] = p2; reinit_completion(&priv->wait_completion); ret = hid_hw_output_report(priv->hdev, priv->cmd_buffer, CMD_BUFFER_SIZE); if (ret < 0) return ret; time = wait_for_completion_timeout(&priv->wait_completion, msecs_to_jiffies(CMD_TIMEOUT_MS)); if (!time) return -ETIMEDOUT; /* * at the start of the reply is an echo of the send command/length in the same order it * was send, not every command is supported on every device class, if a command is not * supported, the length value in the reply is okay, but the command value is set to 0 */ if (p0 != priv->cmd_buffer[0] || p1 != priv->cmd_buffer[1]) return -EOPNOTSUPP; if (data) memcpy(data, priv->cmd_buffer + 2, REPLY_SIZE); return 0; } static int corsairpsu_init(struct corsairpsu_data *priv) { /* * PSU_CMD_INIT uses swapped length/command and expects 2 parameter bytes, this command * actually generates a reply, but we don't need it */ return corsairpsu_usb_cmd(priv, PSU_CMD_INIT, 3, 0, NULL); } static int corsairpsu_fwinfo(struct corsairpsu_data *priv) { int ret; ret = corsairpsu_usb_cmd(priv, 3, PSU_CMD_VEND_STR, 0, priv->vendor); if (ret < 0) return ret; ret = corsairpsu_usb_cmd(priv, 3, PSU_CMD_PROD_STR, 0, priv->product); if (ret < 0) return ret; return 0; } static int corsairpsu_request(struct corsairpsu_data *priv, u8 cmd, u8 rail, void *data) { int ret; mutex_lock(&priv->lock); switch (cmd) { case PSU_CMD_RAIL_VOLTS_HCRIT: case PSU_CMD_RAIL_VOLTS_LCRIT: case PSU_CMD_RAIL_AMPS_HCRIT: case PSU_CMD_RAIL_VOLTS: case PSU_CMD_RAIL_AMPS: case PSU_CMD_RAIL_WATTS: ret = corsairpsu_usb_cmd(priv, 2, PSU_CMD_SELECT_RAIL, rail, NULL); if (ret < 0) goto cmd_fail; break; default: break; } ret = corsairpsu_usb_cmd(priv, 3, cmd, 0, data); cmd_fail: mutex_unlock(&priv->lock); return ret; } static int corsairpsu_get_value(struct corsairpsu_data *priv, u8 cmd, u8 rail, long 
*val) { u8 data[REPLY_SIZE]; long tmp; int ret; ret = corsairpsu_request(priv, cmd, rail, data); if (ret < 0) return ret; /* * the biggest value here comes from the uptime command and to exceed MAXINT total uptime * needs to be about 68 years, the rest are u16 values and the biggest value coming out of * the LINEAR11 conversion are the watts values which are about 1500 for the strongest psu * supported (HX1500i) */ tmp = ((long)data[3] << 24) + (data[2] << 16) + (data[1] << 8) + data[0]; switch (cmd) { case PSU_CMD_RAIL_VOLTS_HCRIT: case PSU_CMD_RAIL_VOLTS_LCRIT: case PSU_CMD_RAIL_AMPS_HCRIT: case PSU_CMD_TEMP_HCRIT: case PSU_CMD_IN_VOLTS: case PSU_CMD_IN_AMPS: case PSU_CMD_RAIL_VOLTS: case PSU_CMD_RAIL_AMPS: case PSU_CMD_TEMP0: case PSU_CMD_TEMP1: *val = corsairpsu_linear11_to_int(tmp & 0xFFFF, 1000); break; case PSU_CMD_FAN: *val = corsairpsu_linear11_to_int(tmp & 0xFFFF, 1); break; case PSU_CMD_FAN_PWM_ENABLE: *val = corsairpsu_linear11_to_int(tmp & 0xFFFF, 1); /* * 0 = automatic mode, means the micro-controller controls the fan using a plan * which can be modified, but changing this plan is not supported by this * driver, the matching PWM mode is automatic fan speed control = PWM 2 * 1 = fixed mode, fan runs at a fixed speed represented by a percentage * value 0-100, this matches the PWM manual fan speed control = PWM 1 * technically there is no PWM no fan speed control mode, it would be a combination * of 1 at 100% */ if (*val == 0) *val = 2; break; case PSU_CMD_FAN_PWM: *val = corsairpsu_linear11_to_int(tmp & 0xFFFF, 1); *val = corsairpsu_dutycycle_to_pwm(*val); break; case PSU_CMD_RAIL_WATTS: case PSU_CMD_TOTAL_WATTS: *val = corsairpsu_linear11_to_int(tmp & 0xFFFF, 1000000); break; case PSU_CMD_TOTAL_UPTIME: case PSU_CMD_UPTIME: case PSU_CMD_OCPMODE: *val = tmp; break; default: ret = -EOPNOTSUPP; break; } return ret; } static void corsairpsu_get_criticals(struct corsairpsu_data *priv) { long tmp; int rail; for (rail = 0; rail < TEMP_COUNT; ++rail) { if (!corsairpsu_get_value(priv, PSU_CMD_TEMP_HCRIT, rail, &tmp)) { priv->temp_crit_support |= BIT(rail); priv->temp_crit[rail] = tmp; } } for (rail = 0; rail < RAIL_COUNT; ++rail) { if (!corsairpsu_get_value(priv, PSU_CMD_RAIL_VOLTS_HCRIT, rail, &tmp)) { priv->in_crit_support |= BIT(rail); priv->in_crit[rail] = tmp; } if (!corsairpsu_get_value(priv, PSU_CMD_RAIL_VOLTS_LCRIT, rail, &tmp)) { priv->in_lcrit_support |= BIT(rail); priv->in_lcrit[rail] = tmp; } if (!corsairpsu_get_value(priv, PSU_CMD_RAIL_AMPS_HCRIT, rail, &tmp)) { priv->curr_crit_support |= BIT(rail); priv->curr_crit[rail] = tmp; } } } static void corsairpsu_check_cmd_support(struct corsairpsu_data *priv) { long tmp; priv->in_curr_cmd_support = !corsairpsu_get_value(priv, PSU_CMD_IN_AMPS, 0, &tmp); } static umode_t corsairpsu_hwmon_temp_is_visible(const struct corsairpsu_data *priv, u32 attr, int channel) { umode_t res = 0444; switch (attr) { case hwmon_temp_input: case hwmon_temp_label: case hwmon_temp_crit: if (channel > 0 && !(priv->temp_crit_support & BIT(channel - 1))) res = 0; break; default: break; } return res; } static umode_t corsairpsu_hwmon_fan_is_visible(const struct corsairpsu_data *priv, u32 attr, int channel) { switch (attr) { case hwmon_fan_input: case hwmon_fan_label: return 0444; default: return 0; } } static umode_t corsairpsu_hwmon_pwm_is_visible(const struct corsairpsu_data *priv, u32 attr, int channel) { switch (attr) { case hwmon_pwm_input: case hwmon_pwm_enable: return 0444; default: return 0; } } static umode_t corsairpsu_hwmon_power_is_visible(const 
struct corsairpsu_data *priv, u32 attr, int channel) { switch (attr) { case hwmon_power_input: case hwmon_power_label: return 0444; default: return 0; } } static umode_t corsairpsu_hwmon_in_is_visible(const struct corsairpsu_data *priv, u32 attr, int channel) { umode_t res = 0444; switch (attr) { case hwmon_in_input: case hwmon_in_label: case hwmon_in_crit: if (channel > 0 && !(priv->in_crit_support & BIT(channel - 1))) res = 0; break; case hwmon_in_lcrit: if (channel > 0 && !(priv->in_lcrit_support & BIT(channel - 1))) res = 0; break; default: break; } return res; } static umode_t corsairpsu_hwmon_curr_is_visible(const struct corsairpsu_data *priv, u32 attr, int channel) { umode_t res = 0444; switch (attr) { case hwmon_curr_input: if (channel == 0 && !priv->in_curr_cmd_support) res = 0; break; case hwmon_curr_label: case hwmon_curr_crit: if (channel > 0 && !(priv->curr_crit_support & BIT(channel - 1))) res = 0; break; default: break; } return res; } static umode_t corsairpsu_hwmon_ops_is_visible(const void *data, enum hwmon_sensor_types type, u32 attr, int channel) { const struct corsairpsu_data *priv = data; switch (type) { case hwmon_temp: return corsairpsu_hwmon_temp_is_visible(priv, attr, channel); case hwmon_fan: return corsairpsu_hwmon_fan_is_visible(priv, attr, channel); case hwmon_pwm: return corsairpsu_hwmon_pwm_is_visible(priv, attr, channel); case hwmon_power: return corsairpsu_hwmon_power_is_visible(priv, attr, channel); case hwmon_in: return corsairpsu_hwmon_in_is_visible(priv, attr, channel); case hwmon_curr: return corsairpsu_hwmon_curr_is_visible(priv, attr, channel); default: return 0; } } static int corsairpsu_hwmon_temp_read(struct corsairpsu_data *priv, u32 attr, int channel, long *val) { int err = -EOPNOTSUPP; switch (attr) { case hwmon_temp_input: return corsairpsu_get_value(priv, channel ? PSU_CMD_TEMP1 : PSU_CMD_TEMP0, channel, val); case hwmon_temp_crit: *val = priv->temp_crit[channel]; err = 0; break; default: break; } return err; } static int corsairpsu_hwmon_pwm_read(struct corsairpsu_data *priv, u32 attr, int channel, long *val) { switch (attr) { case hwmon_pwm_input: return corsairpsu_get_value(priv, PSU_CMD_FAN_PWM, 0, val); case hwmon_pwm_enable: return corsairpsu_get_value(priv, PSU_CMD_FAN_PWM_ENABLE, 0, val); default: break; } return -EOPNOTSUPP; } static int corsairpsu_hwmon_power_read(struct corsairpsu_data *priv, u32 attr, int channel, long *val) { if (attr == hwmon_power_input) { switch (channel) { case 0: return corsairpsu_get_value(priv, PSU_CMD_TOTAL_WATTS, 0, val); case 1 ... 3: return corsairpsu_get_value(priv, PSU_CMD_RAIL_WATTS, channel - 1, val); default: break; } } return -EOPNOTSUPP; } static int corsairpsu_hwmon_in_read(struct corsairpsu_data *priv, u32 attr, int channel, long *val) { int err = -EOPNOTSUPP; switch (attr) { case hwmon_in_input: switch (channel) { case 0: return corsairpsu_get_value(priv, PSU_CMD_IN_VOLTS, 0, val); case 1 ... 3: return corsairpsu_get_value(priv, PSU_CMD_RAIL_VOLTS, channel - 1, val); default: break; } break; case hwmon_in_crit: *val = priv->in_crit[channel - 1]; err = 0; break; case hwmon_in_lcrit: *val = priv->in_lcrit[channel - 1]; err = 0; break; } return err; } static int corsairpsu_hwmon_curr_read(struct corsairpsu_data *priv, u32 attr, int channel, long *val) { int err = -EOPNOTSUPP; switch (attr) { case hwmon_curr_input: switch (channel) { case 0: return corsairpsu_get_value(priv, PSU_CMD_IN_AMPS, 0, val); case 1 ... 
3: return corsairpsu_get_value(priv, PSU_CMD_RAIL_AMPS, channel - 1, val); default: break; } break; case hwmon_curr_crit: *val = priv->curr_crit[channel - 1]; err = 0; break; default: break; } return err; } static int corsairpsu_hwmon_ops_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, long *val) { struct corsairpsu_data *priv = dev_get_drvdata(dev); switch (type) { case hwmon_temp: return corsairpsu_hwmon_temp_read(priv, attr, channel, val); case hwmon_fan: if (attr == hwmon_fan_input) return corsairpsu_get_value(priv, PSU_CMD_FAN, 0, val); return -EOPNOTSUPP; case hwmon_pwm: return corsairpsu_hwmon_pwm_read(priv, attr, channel, val); case hwmon_power: return corsairpsu_hwmon_power_read(priv, attr, channel, val); case hwmon_in: return corsairpsu_hwmon_in_read(priv, attr, channel, val); case hwmon_curr: return corsairpsu_hwmon_curr_read(priv, attr, channel, val); default: return -EOPNOTSUPP; } } static int corsairpsu_hwmon_ops_read_string(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, const char **str) { if (type == hwmon_temp && attr == hwmon_temp_label) { *str = channel ? L_TEMP1 : L_TEMP0; return 0; } else if (type == hwmon_fan && attr == hwmon_fan_label) { *str = L_FAN; return 0; } else if (type == hwmon_power && attr == hwmon_power_label && channel < 4) { *str = label_watts[channel]; return 0; } else if (type == hwmon_in && attr == hwmon_in_label && channel < 4) { *str = label_volts[channel]; return 0; } else if (type == hwmon_curr && attr == hwmon_curr_label && channel < 4) { *str = label_amps[channel]; return 0; } return -EOPNOTSUPP; } static const struct hwmon_ops corsairpsu_hwmon_ops = { .is_visible = corsairpsu_hwmon_ops_is_visible, .read = corsairpsu_hwmon_ops_read, .read_string = corsairpsu_hwmon_ops_read_string, }; static const struct hwmon_channel_info *const corsairpsu_info[] = { HWMON_CHANNEL_INFO(chip, HWMON_C_REGISTER_TZ), HWMON_CHANNEL_INFO(temp, HWMON_T_INPUT | HWMON_T_LABEL | HWMON_T_CRIT, HWMON_T_INPUT | HWMON_T_LABEL | HWMON_T_CRIT), HWMON_CHANNEL_INFO(fan, HWMON_F_INPUT | HWMON_F_LABEL), HWMON_CHANNEL_INFO(pwm, HWMON_PWM_INPUT | HWMON_PWM_ENABLE), HWMON_CHANNEL_INFO(power, HWMON_P_INPUT | HWMON_P_LABEL, HWMON_P_INPUT | HWMON_P_LABEL, HWMON_P_INPUT | HWMON_P_LABEL, HWMON_P_INPUT | HWMON_P_LABEL), HWMON_CHANNEL_INFO(in, HWMON_I_INPUT | HWMON_I_LABEL, HWMON_I_INPUT | HWMON_I_LABEL | HWMON_I_LCRIT | HWMON_I_CRIT, HWMON_I_INPUT | HWMON_I_LABEL | HWMON_I_LCRIT | HWMON_I_CRIT, HWMON_I_INPUT | HWMON_I_LABEL | HWMON_I_LCRIT | HWMON_I_CRIT), HWMON_CHANNEL_INFO(curr, HWMON_C_INPUT | HWMON_C_LABEL, HWMON_C_INPUT | HWMON_C_LABEL | HWMON_C_CRIT, HWMON_C_INPUT | HWMON_C_LABEL | HWMON_C_CRIT, HWMON_C_INPUT | HWMON_C_LABEL | HWMON_C_CRIT), NULL }; static const struct hwmon_chip_info corsairpsu_chip_info = { .ops = &corsairpsu_hwmon_ops, .info = corsairpsu_info, }; #ifdef CONFIG_DEBUG_FS static void print_uptime(struct seq_file *seqf, u8 cmd) { struct corsairpsu_data *priv = seqf->private; long val; int ret; ret = corsairpsu_get_value(priv, cmd, 0, &val); if (ret < 0) { seq_puts(seqf, "N/A\n"); return; } if (val > SECONDS_PER_DAY) { seq_printf(seqf, "%ld day(s), %02ld:%02ld:%02ld\n", val / SECONDS_PER_DAY, val % SECONDS_PER_DAY / SECONDS_PER_HOUR, val % SECONDS_PER_HOUR / 60, val % 60); return; } seq_printf(seqf, "%02ld:%02ld:%02ld\n", val % SECONDS_PER_DAY / SECONDS_PER_HOUR, val % SECONDS_PER_HOUR / 60, val % 60); } static int uptime_show(struct seq_file *seqf, void *unused) { print_uptime(seqf, PSU_CMD_UPTIME); return 0; } 
DEFINE_SHOW_ATTRIBUTE(uptime); static int uptime_total_show(struct seq_file *seqf, void *unused) { print_uptime(seqf, PSU_CMD_TOTAL_UPTIME); return 0; } DEFINE_SHOW_ATTRIBUTE(uptime_total); static int vendor_show(struct seq_file *seqf, void *unused) { struct corsairpsu_data *priv = seqf->private; seq_printf(seqf, "%s\n", priv->vendor); return 0; } DEFINE_SHOW_ATTRIBUTE(vendor); static int product_show(struct seq_file *seqf, void *unused) { struct corsairpsu_data *priv = seqf->private; seq_printf(seqf, "%s\n", priv->product); return 0; } DEFINE_SHOW_ATTRIBUTE(product); static int ocpmode_show(struct seq_file *seqf, void *unused) { struct corsairpsu_data *priv = seqf->private; long val; int ret; /* * The rail mode is switchable on the fly. The RAW interface can be used for this. But it * will not be included here, because I consider it somewhat dangerous for the health of the * PSU. The returned value can be a bogus one, if the PSU is in the process of switching and * getting of the value itself can also fail during this. Because of this every other value * than OCP_MULTI_RAIL can be considered as "single rail". */ ret = corsairpsu_get_value(priv, PSU_CMD_OCPMODE, 0, &val); if (ret < 0) seq_puts(seqf, "N/A\n"); else seq_printf(seqf, "%s\n", (val == OCP_MULTI_RAIL) ? "multi rail" : "single rail"); return 0; } DEFINE_SHOW_ATTRIBUTE(ocpmode); static void corsairpsu_debugfs_init(struct corsairpsu_data *priv) { char name[32]; scnprintf(name, sizeof(name), "%s-%s", DRIVER_NAME, dev_name(&priv->hdev->dev)); priv->debugfs = debugfs_create_dir(name, NULL); debugfs_create_file("uptime", 0444, priv->debugfs, priv, &uptime_fops); debugfs_create_file("uptime_total", 0444, priv->debugfs, priv, &uptime_total_fops); debugfs_create_file("vendor", 0444, priv->debugfs, priv, &vendor_fops); debugfs_create_file("product", 0444, priv->debugfs, priv, &product_fops); debugfs_create_file("ocpmode", 0444, priv->debugfs, priv, &ocpmode_fops); } #else static void corsairpsu_debugfs_init(struct corsairpsu_data *priv) { } #endif static int corsairpsu_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct corsairpsu_data *priv; int ret; priv = devm_kzalloc(&hdev->dev, sizeof(struct corsairpsu_data), GFP_KERNEL); if (!priv) return -ENOMEM; priv->cmd_buffer = devm_kmalloc(&hdev->dev, CMD_BUFFER_SIZE, GFP_KERNEL); if (!priv->cmd_buffer) return -ENOMEM; ret = hid_parse(hdev); if (ret) return ret; ret = hid_hw_start(hdev, HID_CONNECT_HIDRAW); if (ret) return ret; ret = hid_hw_open(hdev); if (ret) goto fail_and_stop; priv->hdev = hdev; hid_set_drvdata(hdev, priv); mutex_init(&priv->lock); init_completion(&priv->wait_completion); hid_device_io_start(hdev); ret = corsairpsu_init(priv); if (ret < 0) { dev_err(&hdev->dev, "unable to initialize device (%d)\n", ret); goto fail_and_stop; } ret = corsairpsu_fwinfo(priv); if (ret < 0) { dev_err(&hdev->dev, "unable to query firmware (%d)\n", ret); goto fail_and_stop; } corsairpsu_get_criticals(priv); corsairpsu_check_cmd_support(priv); priv->hwmon_dev = hwmon_device_register_with_info(&hdev->dev, "corsairpsu", priv, &corsairpsu_chip_info, NULL); if (IS_ERR(priv->hwmon_dev)) { ret = PTR_ERR(priv->hwmon_dev); goto fail_and_close; } corsairpsu_debugfs_init(priv); return 0; fail_and_close: hid_hw_close(hdev); fail_and_stop: hid_hw_stop(hdev); return ret; } static void corsairpsu_remove(struct hid_device *hdev) { struct corsairpsu_data *priv = hid_get_drvdata(hdev); debugfs_remove_recursive(priv->debugfs); hwmon_device_unregister(priv->hwmon_dev); hid_hw_close(hdev); 
hid_hw_stop(hdev); } static int corsairpsu_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct corsairpsu_data *priv = hid_get_drvdata(hdev); if (completion_done(&priv->wait_completion)) return 0; memcpy(priv->cmd_buffer, data, min(CMD_BUFFER_SIZE, size)); complete(&priv->wait_completion); return 0; } #ifdef CONFIG_PM static int corsairpsu_resume(struct hid_device *hdev) { struct corsairpsu_data *priv = hid_get_drvdata(hdev); /* some PSUs turn off the microcontroller during standby, so a reinit is required */ return corsairpsu_init(priv); } #endif static const struct hid_device_id corsairpsu_idtable[] = { { HID_USB_DEVICE(0x1b1c, 0x1c03) }, /* Corsair HX550i */ { HID_USB_DEVICE(0x1b1c, 0x1c04) }, /* Corsair HX650i */ { HID_USB_DEVICE(0x1b1c, 0x1c05) }, /* Corsair HX750i */ { HID_USB_DEVICE(0x1b1c, 0x1c06) }, /* Corsair HX850i */ { HID_USB_DEVICE(0x1b1c, 0x1c07) }, /* Corsair HX1000i Legacy */ { HID_USB_DEVICE(0x1b1c, 0x1c08) }, /* Corsair HX1200i Legacy */ { HID_USB_DEVICE(0x1b1c, 0x1c09) }, /* Corsair RM550i */ { HID_USB_DEVICE(0x1b1c, 0x1c0a) }, /* Corsair RM650i */ { HID_USB_DEVICE(0x1b1c, 0x1c0b) }, /* Corsair RM750i */ { HID_USB_DEVICE(0x1b1c, 0x1c0c) }, /* Corsair RM850i */ { HID_USB_DEVICE(0x1b1c, 0x1c0d) }, /* Corsair RM1000i */ { HID_USB_DEVICE(0x1b1c, 0x1c1e) }, /* Corsair HX1000i Series 2023 */ { HID_USB_DEVICE(0x1b1c, 0x1c1f) }, /* Corsair HX1500i Legacy and Series 2023 */ { HID_USB_DEVICE(0x1b1c, 0x1c23) }, /* Corsair HX1200i Series 2023 */ { }, }; MODULE_DEVICE_TABLE(hid, corsairpsu_idtable); static struct hid_driver corsairpsu_driver = { .name = DRIVER_NAME, .id_table = corsairpsu_idtable, .probe = corsairpsu_probe, .remove = corsairpsu_remove, .raw_event = corsairpsu_raw_event, #ifdef CONFIG_PM .resume = corsairpsu_resume, .reset_resume = corsairpsu_resume, #endif }; static int __init corsair_init(void) { return hid_register_driver(&corsairpsu_driver); } static void __exit corsair_exit(void) { hid_unregister_driver(&corsairpsu_driver); } /* * With module_init() the driver would load before the HID bus when * built-in, so use late_initcall() instead. */ late_initcall(corsair_init); module_exit(corsair_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Wilken Gottwalt <wilken.gottwalt@posteo.net>"); MODULE_DESCRIPTION("Linux driver for Corsair power supplies with HID sensors interface"); |
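The two conversion helpers in the driver above (corsairpsu_linear11_to_int() and corsairpsu_dutycycle_to_pwm()) are plain integer math and can be sanity-checked outside the kernel. Below is a minimal user-space sketch, not part of the driver, that mirrors the same arithmetic; the sample input 0xF819 and the expected outputs are illustration values only.

#include <stdint.h>
#include <stdio.h>

/* Mirrors corsairpsu_linear11_to_int(): bits 15..11 hold the exponent,
 * bits 10..0 the mantissa; the result is mantissa * 2^exponent, scaled
 * to the hwmon unit (e.g. 1000 for millivolts). */
static int linear11_to_int(uint16_t val, int scale)
{
	const int exp = ((int16_t)val) >> 11;
	const int mant = (((int16_t)(val & 0x7ff)) << 5) >> 5;
	const int result = mant * scale;

	return (exp >= 0) ? (result << exp) : (result >> -exp);
}

/* Mirrors corsairpsu_dutycycle_to_pwm(): maps a 0..100 percent duty cycle
 * onto the 0..255 hwmon pwm range. */
static int dutycycle_to_pwm(long dutycycle)
{
	const int result = (256 << 16) / 100;

	return (result * dutycycle) >> 16;
}

int main(void)
{
	/* 0xF819 encodes mantissa 25 with exponent -1, i.e. 12.5 -> 12500 mV. */
	printf("%d mV\n", linear11_to_int(0xF819, 1000));
	/* A 50 percent duty cycle reported by the PSU maps to pwm value 128. */
	printf("pwm %d\n", dutycycle_to_pwm(50));
	return 0;
}

Built with any C compiler, this prints 12500 mV and pwm 128, which is what the hwmon interface would expose for those raw register values.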
// SPDX-License-Identifier: GPL-2.0-or-later /* * Roccat common functions for device specific drivers * * Copyright (c) 2011 Stefan Achatz <erazor_de@users.sourceforge.net> */ /* */ #include <linux/hid.h> #include <linux/slab.h> #include <linux/module.h> #include "hid-roccat-common.h" static inline uint16_t roccat_common2_feature_report(uint8_t report_id) { return 0x300 | report_id; } int roccat_common2_receive(struct usb_device *usb_dev, uint report_id, void *data, uint size) { char *buf; int len; buf = kmalloc(size, GFP_KERNEL); if (buf == NULL) return -ENOMEM; len = usb_control_msg(usb_dev, usb_rcvctrlpipe(usb_dev, 0), HID_REQ_GET_REPORT, USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_IN, roccat_common2_feature_report(report_id), 0, buf, size, USB_CTRL_SET_TIMEOUT); memcpy(data, buf, size); kfree(buf); return ((len < 0) ? len : ((len != size) ? -EIO : 0)); } EXPORT_SYMBOL_GPL(roccat_common2_receive); int roccat_common2_send(struct usb_device *usb_dev, uint report_id, void const *data, uint size) { char *buf; int len; buf = kmemdup(data, size, GFP_KERNEL); if (buf == NULL) return -ENOMEM; len = usb_control_msg(usb_dev, usb_sndctrlpipe(usb_dev, 0), HID_REQ_SET_REPORT, USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_OUT, roccat_common2_feature_report(report_id), 0, buf, size, USB_CTRL_SET_TIMEOUT); kfree(buf); return ((len < 0) ? len : ((len != size) ?
-EIO : 0)); } EXPORT_SYMBOL_GPL(roccat_common2_send); enum roccat_common2_control_states { ROCCAT_COMMON_CONTROL_STATUS_CRITICAL = 0, ROCCAT_COMMON_CONTROL_STATUS_OK = 1, ROCCAT_COMMON_CONTROL_STATUS_INVALID = 2, ROCCAT_COMMON_CONTROL_STATUS_BUSY = 3, ROCCAT_COMMON_CONTROL_STATUS_CRITICAL_NEW = 4, }; static int roccat_common2_receive_control_status(struct usb_device *usb_dev) { int retval; struct roccat_common2_control control; do { msleep(50); retval = roccat_common2_receive(usb_dev, ROCCAT_COMMON_COMMAND_CONTROL, &control, sizeof(struct roccat_common2_control)); if (retval) return retval; switch (control.value) { case ROCCAT_COMMON_CONTROL_STATUS_OK: return 0; case ROCCAT_COMMON_CONTROL_STATUS_BUSY: msleep(500); continue; case ROCCAT_COMMON_CONTROL_STATUS_INVALID: case ROCCAT_COMMON_CONTROL_STATUS_CRITICAL: case ROCCAT_COMMON_CONTROL_STATUS_CRITICAL_NEW: return -EINVAL; default: dev_err(&usb_dev->dev, "roccat_common2_receive_control_status: " "unknown response value 0x%x\n", control.value); return -EINVAL; } } while (1); } int roccat_common2_send_with_status(struct usb_device *usb_dev, uint command, void const *buf, uint size) { int retval; retval = roccat_common2_send(usb_dev, command, buf, size); if (retval) return retval; msleep(100); return roccat_common2_receive_control_status(usb_dev); } EXPORT_SYMBOL_GPL(roccat_common2_send_with_status); int roccat_common2_device_init_struct(struct usb_device *usb_dev, struct roccat_common2_device *dev) { mutex_init(&dev->lock); return 0; } EXPORT_SYMBOL_GPL(roccat_common2_device_init_struct); ssize_t roccat_common2_sysfs_read(struct file *fp, struct kobject *kobj, char *buf, loff_t off, size_t count, size_t real_size, uint command) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct roccat_common2_device *roccat_dev = hid_get_drvdata(dev_get_drvdata(dev)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); int retval; if (off >= real_size) return 0; if (off != 0 || count != real_size) return -EINVAL; mutex_lock(&roccat_dev->lock); retval = roccat_common2_receive(usb_dev, command, buf, real_size); mutex_unlock(&roccat_dev->lock); return retval ? retval : real_size; } EXPORT_SYMBOL_GPL(roccat_common2_sysfs_read); ssize_t roccat_common2_sysfs_write(struct file *fp, struct kobject *kobj, void const *buf, loff_t off, size_t count, size_t real_size, uint command) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct roccat_common2_device *roccat_dev = hid_get_drvdata(dev_get_drvdata(dev)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); int retval; if (off != 0 || count != real_size) return -EINVAL; mutex_lock(&roccat_dev->lock); retval = roccat_common2_send_with_status(usb_dev, command, buf, real_size); mutex_unlock(&roccat_dev->lock); return retval ? retval : real_size; } EXPORT_SYMBOL_GPL(roccat_common2_sysfs_write); MODULE_AUTHOR("Stefan Achatz"); MODULE_DESCRIPTION("USB Roccat common driver"); MODULE_LICENSE("GPL v2"); |
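The exported roccat_common2_sysfs_read()/roccat_common2_sysfs_write() helpers above are meant to be wrapped by device-specific Roccat drivers that expose feature reports as binary sysfs attributes. The following is a hedged sketch of such a wrapper; the attribute name "profile", the report size 43 and the command id 0x05 are hypothetical placeholders, and the exact bin_attribute callback prototype depends on the kernel version.

/* Hypothetical device-specific wrapper around the common helpers above;
 * command 0x05 and size 43 are placeholders, not taken from real hardware. */
static ssize_t example_profile_write(struct file *fp, struct kobject *kobj,
				     struct bin_attribute *attr, char *buf,
				     loff_t off, size_t count)
{
	/* Sends the user buffer via SET_REPORT, then polls the CONTROL report
	 * until the device reports OK (roccat_common2_send_with_status). */
	return roccat_common2_sysfs_write(fp, kobj, buf, off, count, 43, 0x05);
}

static ssize_t example_profile_read(struct file *fp, struct kobject *kobj,
				    struct bin_attribute *attr, char *buf,
				    loff_t off, size_t count)
{
	return roccat_common2_sysfs_read(fp, kobj, buf, off, count, 43, 0x05);
}

static struct bin_attribute example_profile_attr = {
	.attr = { .name = "profile", .mode = 0660 },
	.size = 43,
	.read = example_profile_read,
	.write = example_profile_write,
};

Registering example_profile_attr on the device's sysfs directory would then route all report traffic through the mutex in struct roccat_common2_device, exactly as the helpers above do.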
// SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 output functions * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/net/ipv4/ip_output.c * * Changes: * A.N.Kuznetsov : arithmetics in fragmentation. * extension headers are implemented. * route changes now work. * ip6_forward does not confuse sniffers. * etc. * * H.
von Brand : Added missing #include <linux/string.h> * Imran Patel : frag id should be in NBO * Kazunori MIYAZAWA @USAGI * : add ip6_append_data and related functions * for datagram xmit */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/in6.h> #include <linux/tcp.h> #include <linux/route.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/bpf-cgroup.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <net/sock.h> #include <net/snmp.h> #include <net/gso.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/rawv6.h> #include <net/icmp.h> #include <net/xfrm.h> #include <net/checksum.h> #include <linux/mroute6.h> #include <net/l3mdev.h> #include <net/lwtunnel.h> #include <net/ip_tunnels.h> static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; struct inet6_dev *idev = ip6_dst_idev(dst); unsigned int hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *daddr, *nexthop; struct ipv6hdr *hdr; struct neighbour *neigh; int ret; /* Be paranoid, rather than too clever. */ if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) { /* Make sure idev stays alive */ rcu_read_lock(); skb = skb_expand_head(skb, hh_len); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return -ENOMEM; } rcu_read_unlock(); } hdr = ipv6_hdr(skb); daddr = &hdr->daddr; if (ipv6_addr_is_multicast(daddr)) { if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) && ((mroute6_is_socket(net, skb) && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); /* Do not check for IFF_ALLMULTI; multicast routing is not supported in any case. */ if (newskb) NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, dev_loopback_xmit); if (hdr->hop_limit == 0) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return 0; } } IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len); if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL && !(dev->flags & IFF_LOOPBACK)) { kfree_skb(skb); return 0; } } if (lwtunnel_xmit_redirect(dst->lwtstate)) { int res = lwtunnel_xmit(skb); if (res != LWTUNNEL_XMIT_CONTINUE) return res; } IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); rcu_read_lock(); nexthop = rt6_nexthop(dst_rt6_info(dst), daddr); neigh = __ipv6_neigh_lookup_noref(dev, nexthop); if (IS_ERR_OR_NULL(neigh)) { if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dev, false); if (IS_ERR(neigh)) { rcu_read_unlock(); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); return -EINVAL; } } sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); rcu_read_unlock(); return ret; } static int ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu) { struct sk_buff *segs, *nskb; netdev_features_t features; int ret = 0; /* Please see corresponding comment in ip_finish_output_gso * describing the cases where GSO segment length exceeds the * egress MTU. 
*/ features = netif_skb_features(skb); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { kfree_skb(skb); return -ENOMEM; } consume_skb(skb); skb_list_walk_safe(segs, segs, nskb) { int err; skb_mark_not_on_list(segs); /* Last GSO segment can be smaller than gso_size (and MTU). * Adding a fragment header would produce an "atomic fragment", * which is considered harmful (RFC-8021). Avoid that. */ err = segs->len > mtu ? ip6_fragment(net, sk, segs, ip6_finish_output2) : ip6_finish_output2(net, sk, segs); if (err && ret == 0) ret = err; } return ret; } static int ip6_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu) { if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) && !skb_gso_validate_network_len(skb, mtu)) return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); return ip6_finish_output2(net, sk, skb); } static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { IP6CB(skb)->flags |= IP6SKB_REROUTED; return dst_output(net, sk, skb); } #endif mtu = ip6_skb_dst_mtu(skb); if (skb_is_gso(skb)) return ip6_finish_output_gso(net, sk, skb, mtu); if (skb->len > mtu || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) return ip6_fragment(net, sk, skb, ip6_finish_output2); return ip6_finish_output2(net, sk, skb); } static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { int ret; ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); switch (ret) { case NET_XMIT_SUCCESS: case NET_XMIT_CN: return __ip6_finish_output(net, sk, skb) ? : ret; default: kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); return ret; } } int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); return 0; } return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, indev, dev, ip6_finish_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } EXPORT_SYMBOL(ip6_output); bool ip6_autoflowlabel(struct net *net, const struct sock *sk) { if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk)) return ip6_default_np_autolabel(net); return inet6_test_bit(AUTOFLOWLABEL, sk); } /* * xmit an sk_buff (used by TCP, SCTP and DCCP) * Note : socket lock is not held for SYNACK packets, but might be modified * by calls to skb_set_owner_w() and ipv6_local_error(), * which are using proper atomic operations or spinlocks. 
*/ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) { struct net *net = sock_net(sk); const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; struct inet6_dev *idev = ip6_dst_idev(dst); struct hop_jumbo_hdr *hop_jumbo; int hoplen = sizeof(*hop_jumbo); unsigned int head_room; struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; int seg_len = skb->len; int hlimit = -1; u32 mtu; head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev); if (opt) head_room += opt->opt_nflen + opt->opt_flen; if (unlikely(head_room > skb_headroom(skb))) { /* Make sure idev stays alive */ rcu_read_lock(); skb = skb_expand_head(skb, head_room); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return -ENOBUFS; } rcu_read_unlock(); } if (opt) { seg_len += opt->opt_nflen + opt->opt_flen; if (opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); if (opt->opt_nflen) ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop, &fl6->saddr); } if (unlikely(seg_len > IPV6_MAXPLEN)) { hop_jumbo = skb_push(skb, hoplen); hop_jumbo->nexthdr = proto; hop_jumbo->hdrlen = 0; hop_jumbo->tlv_type = IPV6_TLV_JUMBO; hop_jumbo->tlv_len = 4; hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen); proto = IPPROTO_HOPOPTS; seg_len = 0; IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO; } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); /* * Fill in the IPv6 header */ if (np) hlimit = READ_ONCE(np->hop_limit); if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, ip6_autoflowlabel(net, sk), fl6)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; hdr->hop_limit = hlimit; hdr->saddr = fl6->saddr; hdr->daddr = *first_hop; skb->protocol = htons(ETH_P_IPV6); skb->priority = priority; skb->mark = mark; mtu = dst_mtu(dst); if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip6_out((struct sock *)sk, skb); if (unlikely(!skb)) return 0; /* hooks should never assume socket lock is held. 
* we promote our socket to non const */ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, (struct sock *)sk, skb, NULL, dev, dst_output); } skb->dev = dev; /* ipv6_local_error() does not require socket lock, * we promote our socket to non const */ ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu); IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } EXPORT_SYMBOL(ip6_xmit); static int ip6_call_ra_chain(struct sk_buff *skb, int sel) { struct ip6_ra_chain *ra; struct sock *last = NULL; read_lock(&ip6_ra_lock); for (ra = ip6_ra_chain; ra; ra = ra->next) { struct sock *sk = ra->sk; if (sk && ra->sel == sel && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == skb->dev->ifindex)) { if (inet6_test_bit(RTALERT_ISOLATE, sk) && !net_eq(sock_net(sk), dev_net(skb->dev))) { continue; } if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) rawv6_rcv(last, skb2); } last = sk; } } if (last) { rawv6_rcv(last, skb); read_unlock(&ip6_ra_lock); return 1; } read_unlock(&ip6_ra_lock); return 0; } static int ip6_forward_proxy_check(struct sk_buff *skb) { struct ipv6hdr *hdr = ipv6_hdr(skb); u8 nexthdr = hdr->nexthdr; __be16 frag_off; int offset; if (ipv6_ext_hdr(nexthdr)) { offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off); if (offset < 0) return 0; } else offset = sizeof(struct ipv6hdr); if (nexthdr == IPPROTO_ICMPV6) { struct icmp6hdr *icmp6; if (!pskb_may_pull(skb, (skb_network_header(skb) + offset + 1 - skb->data))) return 0; icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); switch (icmp6->icmp6_type) { case NDISC_ROUTER_SOLICITATION: case NDISC_ROUTER_ADVERTISEMENT: case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: /* For reaction involving unicast neighbor discovery * message destined to the proxied address, pass it to * input function. */ return 1; default: break; } } /* * The proxying router can't forward traffic sent to a link-local * address, so signal the sender and discard the packet. This * behavior is clarified by the MIPv6 specification. 
*/ if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) { dst_link_failure(skb); return -1; } return 0; } static inline int ip6_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { #ifdef CONFIG_NET_SWITCHDEV if (skb->offload_l3_fwd_mark) { consume_skb(skb); return 0; } #endif skb_clear_tstamp(skb); return dst_output(net, sk, skb); } static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; /* ipv6 conntrack defrag sets max_frag_size + ignore_df */ if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu) return true; if (skb->ignore_df) return false; if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) return false; return true; } int ip6_forward(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(dst->dev); struct inet6_dev *idev; SKB_DR(reason); u32 mtu; idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0) goto error; if (skb->pkt_type != PACKET_HOST) goto drop; if (unlikely(skb->sk)) goto drop; if (skb_warn_if_lro(skb)) goto drop; if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) && (!idev || !READ_ONCE(idev->cnf.disable_policy)) && !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } skb_forward_csum(skb); /* * We DO NOT make any processing on * RA packets, pushing them to user level AS IS * without ane WARRANTY that application will be able * to interpret them. The reason is that we * cannot make anything clever here. * * We are not end-node, so that if packet contains * AH/ESP, we cannot make anything. * Defragmentation also would be mistake, RA packets * cannot be fragmented, because there is no warranty * that different fragments will go along one path. --ANK */ if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { if (ip6_call_ra_chain(skb, ntohs(opt->ra))) return 0; } /* * check and decrement ttl */ if (hdr->hop_limit <= 1) { icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return -ETIMEDOUT; } /* XXX: idev->cnf.proxy_ndp? */ if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { int proxied = ip6_forward_proxy_check(skb); if (proxied > 0) { /* It's tempting to decrease the hop limit * here by 1, as we do at the end of the * function too. * * But that would be incorrect, as proxying is * not forwarding. The ip6_input function * will handle this packet locally, and it * depends on the hop limit being unchanged. * * One example is the NDP hop limit, that * always has to stay 255, but other would be * similar checks around RA packets, where the * user can even change the desired limit. */ return ip6_input(skb); } else if (proxied < 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } } if (!xfrm6_route_forward(skb)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); SKB_DR_SET(reason, XFRM_POLICY); goto drop; } dst = skb_dst(skb); /* IPv6 specs say nothing about it, but it is clear that we cannot send redirects to source routed frames. We don't send redirects to frames decapsulated from IPsec. 
*/ if (IP6CB(skb)->iif == dst->dev->ifindex && opt->srcrt == 0 && !skb_sec_path(skb)) { struct in6_addr *target = NULL; struct inet_peer *peer; struct rt6_info *rt; /* * incoming and outgoing devices are the same * send a redirect. */ rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_GATEWAY) target = &rt->rt6i_gateway; else target = &hdr->daddr; rcu_read_lock(); peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr); /* Limit redirects both by destination (here) and by source (inside ndisc_send_redirect) */ if (inet_peer_xrlim_allow(peer, 1*HZ)) ndisc_send_redirect(skb, target); rcu_read_unlock(); } else { int addrtype = ipv6_addr_type(&hdr->saddr); /* This check is security critical. */ if (addrtype == IPV6_ADDR_ANY || addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK)) goto error; if (addrtype & IPV6_ADDR_LINKLOCAL) { icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOT_NEIGHBOUR, 0); goto error; } } __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); mtu = ip6_dst_mtu_maybe_forward(dst, true); if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; if (ip6_pkt_too_big(skb, mtu)) { /* Again, force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS); __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS); kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); return -EMSGSIZE; } if (skb_cow(skb, dst->dev->hard_header_len)) { __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); goto drop; } hdr = ipv6_hdr(skb); /* Mangling hops number delayed to point after skb COW */ hdr->hop_limit--; return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, net, NULL, skb, skb->dev, dst->dev, ip6_forward_finish); error: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); SKB_DR_SET(reason, IP_INADDRERRORS); drop: kfree_skb_reason(skb, reason); return -EINVAL; } static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) { to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; skb_dst_drop(to); skb_dst_set(to, dst_clone(skb_dst(from))); to->dev = from->dev; to->mark = from->mark; skb_copy_hash(to, from); #ifdef CONFIG_NET_SCHED to->tc_index = from->tc_index; #endif nf_copy(to, from); skb_ext_copy(to, from); skb_copy_secmark(to, from); } int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, u8 nexthdr, __be32 frag_id, struct ip6_fraglist_iter *iter) { unsigned int first_len; struct frag_hdr *fh; /* BUILD HEADER */ *prevhdr = NEXTHDR_FRAGMENT; iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); if (!iter->tmp_hdr) return -ENOMEM; iter->frag = skb_shinfo(skb)->frag_list; skb_frag_list_init(skb); iter->offset = 0; iter->hlen = hlen; iter->frag_id = frag_id; iter->nexthdr = nexthdr; __skb_pull(skb, hlen); fh = __skb_push(skb, sizeof(struct frag_hdr)); __skb_push(skb, hlen); skb_reset_network_header(skb); memcpy(skb_network_header(skb), iter->tmp_hdr, hlen); fh->nexthdr = nexthdr; fh->reserved = 0; fh->frag_off = htons(IP6_MF); fh->identification = frag_id; first_len = skb_pagelen(skb); skb->data_len = first_len - skb_headlen(skb); skb->len = first_len; ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); return 0; } EXPORT_SYMBOL(ip6_fraglist_init); void ip6_fraglist_prepare(struct sk_buff *skb, struct ip6_fraglist_iter *iter) { struct sk_buff *frag = iter->frag; unsigned int hlen = iter->hlen; struct frag_hdr *fh; frag->ip_summed = CHECKSUM_NONE; skb_reset_transport_header(frag); fh = 
__skb_push(frag, sizeof(struct frag_hdr)); __skb_push(frag, hlen); skb_reset_network_header(frag); memcpy(skb_network_header(frag), iter->tmp_hdr, hlen); iter->offset += skb->len - hlen - sizeof(struct frag_hdr); fh->nexthdr = iter->nexthdr; fh->reserved = 0; fh->frag_off = htons(iter->offset); if (frag->next) fh->frag_off |= htons(IP6_MF); fh->identification = iter->frag_id; ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); ip6_copy_metadata(frag, skb); } EXPORT_SYMBOL(ip6_fraglist_prepare); void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state) { state->prevhdr = prevhdr; state->nexthdr = nexthdr; state->frag_id = frag_id; state->hlen = hlen; state->mtu = mtu; state->left = skb->len - hlen; /* Space per frame */ state->ptr = hlen; /* Where to start from */ state->hroom = hdr_room; state->troom = needed_tailroom; state->offset = 0; } EXPORT_SYMBOL(ip6_frag_init); struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state) { u8 *prevhdr = state->prevhdr, *fragnexthdr_offset; struct sk_buff *frag; struct frag_hdr *fh; unsigned int len; len = state->left; /* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > state->mtu) len = state->mtu; /* IF: we are not sending up to and including the packet end then align the next start on an eight byte boundary */ if (len < state->left) len &= ~7; /* Allocate buffer */ frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) + state->hroom + state->troom, GFP_ATOMIC); if (!frag) return ERR_PTR(-ENOMEM); /* * Set up data on packet */ ip6_copy_metadata(frag, skb); skb_reserve(frag, state->hroom); skb_put(frag, len + state->hlen + sizeof(struct frag_hdr)); skb_reset_network_header(frag); fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen); frag->transport_header = (frag->network_header + state->hlen + sizeof(struct frag_hdr)); /* * Charge the memory for the fragment to any owner * it might possess */ if (skb->sk) skb_set_owner_w(frag, skb->sk); /* * Copy the packet header into the new buffer. */ skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen); fragnexthdr_offset = skb_network_header(frag); fragnexthdr_offset += prevhdr - skb_network_header(skb); *fragnexthdr_offset = NEXTHDR_FRAGMENT; /* * Build fragment header. */ fh->nexthdr = state->nexthdr; fh->reserved = 0; fh->identification = state->frag_id; /* * Copy a block of the IP datagram. */ BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag), len)); state->left -= len; fh->frag_off = htons(state->offset); if (state->left > 0) fh->frag_off |= htons(IP6_MF); ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); state->ptr += len; state->offset += len; return frag; } EXPORT_SYMBOL(ip6_frag_next); int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct sk_buff *frag; struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? 
inet6_sk(skb->sk) : NULL; u8 tstamp_type = skb->tstamp_type; struct ip6_frag_state state; unsigned int mtu, hlen, nexthdr_offset; ktime_t tstamp = skb->tstamp; int hroom, err = 0; __be32 frag_id; u8 *prevhdr, nexthdr = 0; err = ip6_find_1stfragopt(skb, &prevhdr); if (err < 0) goto fail; hlen = err; nexthdr = *prevhdr; nexthdr_offset = prevhdr - skb_network_header(skb); mtu = ip6_skb_dst_mtu(skb); /* We must not fragment if the socket is set to force MTU discovery * or if the skb it not generated by a local socket. */ if (unlikely(!skb->ignore_df && skb->len > mtu)) goto fail_toobig; if (IP6CB(skb)->frag_max_size) { if (IP6CB(skb)->frag_max_size > mtu) goto fail_toobig; /* don't send fragments larger than what we received */ mtu = IP6CB(skb)->frag_max_size; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; } if (np) { u32 frag_size = READ_ONCE(np->frag_size); if (frag_size && frag_size < mtu) mtu = frag_size; } if (mtu < hlen + sizeof(struct frag_hdr) + 8) goto fail_toobig; mtu -= hlen + sizeof(struct frag_hdr); frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, &ipv6_hdr(skb)->saddr); if (skb->ip_summed == CHECKSUM_PARTIAL && (err = skb_checksum_help(skb))) goto fail; prevhdr = skb_network_header(skb) + nexthdr_offset; hroom = LL_RESERVED_SPACE(rt->dst.dev); if (skb_has_frag_list(skb)) { unsigned int first_len = skb_pagelen(skb); struct ip6_fraglist_iter iter; struct sk_buff *frag2; if (first_len - hlen > mtu || ((first_len - hlen) & 7) || skb_cloned(skb) || skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) goto slow_path; skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr))) goto slow_path_clean; /* Partially cloned skb? */ if (skb_shared(frag)) goto slow_path_clean; BUG_ON(frag->sk); if (skb->sk) { frag->sk = skb->sk; frag->destructor = sock_wfree; } skb->truesize -= frag->truesize; } err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, &iter); if (err < 0) goto fail; /* We prevent @rt from being freed. */ rcu_read_lock(); for (;;) { /* Prepare header of the next frame, * before previous one went down. */ if (iter.frag) ip6_fraglist_prepare(skb, &iter); skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGCREATES); if (err || !iter.frag) break; skb = ip6_fraglist_next(&iter); } kfree(iter.tmp_hdr); if (err == 0) { IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGOKS); rcu_read_unlock(); return 0; } kfree_skb_list(iter.frag); IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGFAILS); rcu_read_unlock(); return err; slow_path_clean: skb_walk_frags(skb, frag2) { if (frag2 == frag) break; frag2->sk = NULL; frag2->destructor = NULL; skb->truesize += frag2->truesize; } } slow_path: /* * Fragment the datagram. */ ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom, LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id, &state); /* * Keep copying data until we run out. */ while (state.left > 0) { frag = ip6_frag_next(skb, &state); if (IS_ERR(frag)) { err = PTR_ERR(frag); goto fail; } /* * Put this fragment into the sending queue. 
*/ skb_set_delivery_time(frag, tstamp, tstamp_type); err = output(net, sk, frag); if (err) goto fail; IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGCREATES); } IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGOKS); consume_skb(skb); return err; fail_toobig: icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); err = -EMSGSIZE; fail: IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return err; } static inline int ip6_rt_check(const struct rt6key *rt_key, const struct in6_addr *fl_addr, const struct in6_addr *addr_cache) { return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache)); } static struct dst_entry *ip6_sk_dst_check(struct sock *sk, struct dst_entry *dst, const struct flowi6 *fl6) { struct ipv6_pinfo *np = inet6_sk(sk); struct rt6_info *rt; if (!dst) goto out; if (dst->ops->family != AF_INET6) { dst_release(dst); return NULL; } rt = dst_rt6_info(dst); /* Yes, checking route validity in not connected * case is not very simple. Take into account, * that we do not support routing by source, TOS, * and MSG_DONTROUTE --ANK (980726) * * 1. ip6_rt_check(): If route was host route, * check that cached destination is current. * If it is network route, we still may * check its validity using saved pointer * to the last used address: daddr_cache. * We do not want to save whole address now, * (because main consumer of this service * is tcp, which has not this problem), * so that the last trick works only on connected * sockets. * 2. oif also should be the same. */ if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || #ifdef CONFIG_IPV6_SUBTREES ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || #endif (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { dst_release(dst); dst = NULL; } out: return dst; } static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6) { #ifdef CONFIG_IPV6_OPTIMISTIC_DAD struct neighbour *n; struct rt6_info *rt; #endif int err; int flags = 0; /* The correct way to handle this would be to do * ip6_route_get_saddr, and then ip6_route_output; however, * the route-specific preferred source forces the * ip6_route_output call _before_ ip6_route_get_saddr. * * In source specific routing (no src=any default route), * ip6_route_output will fail given src=any saddr, though, so * that's why we try it again later. */ if (ipv6_addr_any(&fl6->saddr)) { struct fib6_info *from; struct rt6_info *rt; *dst = ip6_route_output(net, sk, fl6); rt = (*dst)->error ? NULL : dst_rt6_info(*dst); rcu_read_lock(); from = rt ? rcu_dereference(rt->from) : NULL; err = ip6_route_get_saddr(net, from, &fl6->daddr, sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0, fl6->flowi6_l3mdev, &fl6->saddr); rcu_read_unlock(); if (err) goto out_err_release; /* If we had an erroneous initial result, pretend it * never existed and let the SA-enabled version take * over. 
*/ if ((*dst)->error) { dst_release(*dst); *dst = NULL; } if (fl6->flowi6_oif) flags |= RT6_LOOKUP_F_IFACE; } if (!*dst) *dst = ip6_route_output_flags(net, sk, fl6, flags); err = (*dst)->error; if (err) goto out_err_release; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD /* * Here if the dst entry we've looked up * has a neighbour entry that is in the INCOMPLETE * state and the src address from the flow is * marked as OPTIMISTIC, we release the found * dst entry and replace it instead with the * dst entry of the nexthop router */ rt = dst_rt6_info(*dst); rcu_read_lock(); n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr)); err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0; rcu_read_unlock(); if (err) { struct inet6_ifaddr *ifp; struct flowi6 fl_gw6; int redirect; ifp = ipv6_get_ifaddr(net, &fl6->saddr, (*dst)->dev, 1); redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); if (ifp) in6_ifa_put(ifp); if (redirect) { /* * We need to get the dst entry for the * default router instead */ dst_release(*dst); memcpy(&fl_gw6, fl6, sizeof(struct flowi6)); memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr)); *dst = ip6_route_output(net, sk, &fl_gw6); err = (*dst)->error; if (err) goto out_err_release; } } #endif if (ipv6_addr_v4mapped(&fl6->saddr) && !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) { err = -EAFNOSUPPORT; goto out_err_release; } return 0; out_err_release: dst_release(*dst); *dst = NULL; if (err == -ENETUNREACH) IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); return err; } /** * ip6_dst_lookup - perform route lookup on flow * @net: Network namespace to perform lookup in * @sk: socket which provides route info * @dst: pointer to dst_entry * for result * @fl6: flow to lookup * * This function performs a route lookup on the given flow. * * It returns zero on success, or a standard errno code on error. */ int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6) { *dst = NULL; return ip6_dst_lookup_tail(net, sk, dst, fl6); } EXPORT_SYMBOL_GPL(ip6_dst_lookup); /** * ip6_dst_lookup_flow - perform route lookup on flow with ipsec * @net: Network namespace to perform lookup in * @sk: socket which provides route info * @fl6: flow to lookup * @final_dst: final destination address for ipsec lookup * * This function performs a route lookup on the given flow. * * It returns a valid dst pointer on success, or a pointer encoded * error code. */ struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst) { struct dst_entry *dst = NULL; int err; err = ip6_dst_lookup_tail(net, sk, &dst, fl6); if (err) return ERR_PTR(err); if (final_dst) fl6->daddr = *final_dst; return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0); } EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); /** * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow * @sk: socket which provides the dst cache and route info * @fl6: flow to lookup * @final_dst: final destination address for ipsec lookup * @connected: whether @sk is connected or not * * This function performs a route lookup on the given flow with the * possibility of using the cached route in the socket if it is valid. * It will take the socket dst lock when operating on the dst cache. * As a result, this function can only be used in process context. * * In addition, for a connected socket, cache the dst in the socket * if the current cache is not valid. 
* * It returns a valid dst pointer on success, or a pointer encoded * error code. */ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst, bool connected) { struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); dst = ip6_sk_dst_check(sk, dst, fl6); if (dst) return dst; dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst); if (connected && !IS_ERR(dst)) ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6); return dst; } EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, gfp_t gfp) { return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; } static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src, gfp_t gfp) { return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; } static void ip6_append_data_mtu(unsigned int *mtu, int *maxfraglen, unsigned int fragheaderlen, struct sk_buff *skb, struct rt6_info *rt, unsigned int orig_mtu) { if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { if (!skb) { /* first fragment, reserve header_len */ *mtu = orig_mtu - rt->dst.header_len; } else { /* * this fragment is not first, the headers * space is regarded as data space. */ *mtu = orig_mtu; } *maxfraglen = ((*mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); } } static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6, struct rt6_info *rt) { struct ipv6_pinfo *np = inet6_sk(sk); unsigned int mtu, frag_size; struct ipv6_txoptions *nopt, *opt = ipc6->opt; /* callers pass dst together with a reference, set it first so * ip6_cork_release() can put it down even in case of an error. */ cork->base.dst = &rt->dst; /* * setup for corking */ if (opt) { if (WARN_ON(v6_cork->opt)) return -EINVAL; nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation); if (unlikely(!nopt)) return -ENOBUFS; nopt->tot_len = sizeof(*opt); nopt->opt_flen = opt->opt_flen; nopt->opt_nflen = opt->opt_nflen; nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation); if (opt->dst0opt && !nopt->dst0opt) return -ENOBUFS; nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation); if (opt->dst1opt && !nopt->dst1opt) return -ENOBUFS; nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation); if (opt->hopopt && !nopt->hopopt) return -ENOBUFS; nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation); if (opt->srcrt && !nopt->srcrt) return -ENOBUFS; /* need source address above miyazawa*/ } v6_cork->hop_limit = ipc6->hlimit; v6_cork->tclass = ipc6->tclass; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); else mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? 
READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst)); frag_size = READ_ONCE(np->frag_size); if (frag_size && frag_size < mtu) mtu = frag_size; cork->base.fragsize = mtu; cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; cork->base.mark = ipc6->sockc.mark; cork->base.priority = ipc6->sockc.priority; sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags); if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { cork->base.flags |= IPCORK_TS_OPT_ID; cork->base.ts_opt_id = ipc6->sockc.ts_opt_id; } cork->base.length = 0; cork->base.transmit_time = ipc6->sockc.transmit_time; return 0; } static int __ip6_append_data(struct sock *sk, struct sk_buff_head *queue, struct inet_cork_full *cork_full, struct inet6_cork *v6_cork, struct page_frag *pfrag, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, unsigned int flags, struct ipcm6_cookie *ipc6) { struct sk_buff *skb, *skb_prev = NULL; struct inet_cork *cork = &cork_full->base; struct flowi6 *fl6 = &cork_full->fl.u.ip6; unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; struct ubuf_info *uarg = NULL; int exthdrlen = 0; int dst_exthdrlen = 0; int hh_len; int copy; int err; int offset = 0; bool zc = false; u32 tskey = 0; struct rt6_info *rt = dst_rt6_info(cork->dst); bool paged, hold_tskey = false, extra_uref = false; struct ipv6_txoptions *opt = v6_cork->opt; int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; unsigned int wmem_alloc_delta = 0; skb = skb_peek_tail(queue); if (!skb) { exthdrlen = opt ? opt->opt_flen : 0; dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; } paged = !!cork->gso_size; mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; orig_mtu = mtu; hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + (opt ? opt->opt_nflen : 0); headersize = sizeof(struct ipv6hdr) + (opt ? opt->opt_flen + opt->opt_nflen : 0) + rt->rt6i_nfheader_len; if (mtu <= fragheaderlen || ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr)) goto emsgsize; maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit * the first fragment */ if (headersize + transhdrlen > mtu) goto emsgsize; if (cork->length + length > mtu - headersize && ipc6->dontfrag && (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_ICMPV6 || sk->sk_protocol == IPPROTO_RAW)) { ipv6_local_rxpmtu(sk, fl6, mtu - headersize + sizeof(struct ipv6hdr)); goto emsgsize; } if (ip6_sk_ignore_df(sk)) maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; else maxnonfragsize = mtu; if (cork->length + length > maxnonfragsize - headersize) { emsgsize: pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0); ipv6_local_error(sk, EMSGSIZE, fl6, pmtu); return -EMSGSIZE; } /* CHECKSUM_PARTIAL only with no extension headers and when * we are not going to fragment */ if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && headersize == sizeof(struct ipv6hdr) && length <= mtu - headersize && (!(flags & MSG_MORE) || cork->gso_size) && rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) csummode = CHECKSUM_PARTIAL; if ((flags & MSG_ZEROCOPY) && length) { struct msghdr *msg = from; if (getfrag == ip_generic_getfrag && msg->msg_ubuf) { if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb)) return -EINVAL; /* Leave uarg NULL if can't zerocopy, callers should * be able to handle it. 
*/ if ((rt->dst.dev->features & NETIF_F_SG) && csummode == CHECKSUM_PARTIAL) { paged = true; zc = true; uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ if (rt->dst.dev->features & NETIF_F_SG && csummode == CHECKSUM_PARTIAL) { paged = true; zc = true; } else { uarg_to_msgzc(uarg)->zerocopy = 0; skb_zcopy_set(skb, uarg, &extra_uref); } } } else if ((flags & MSG_SPLICE_PAGES) && length) { if (inet_test_bit(HDRINCL, sk)) return -EPERM; if (rt->dst.dev->features & NETIF_F_SG && getfrag == ip_generic_getfrag) /* We need an empty buffer to attach stuff to */ paged = true; else flags &= ~MSG_SPLICE_PAGES; } if (cork->tx_flags & SKBTX_ANY_TSTAMP && READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { if (cork->flags & IPCORK_TS_OPT_ID) { tskey = cork->ts_opt_id; } else { tskey = atomic_inc_return(&sk->sk_tskey) - 1; hold_tskey = true; } } /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. * Otherwise, we need to reserve fragment header and * fragment alignment (= 8-15 octects, in total). * * Note that we may need to "move" the data from the tail * of the buffer to the new fragment when we split * the message. * * FIXME: It may be fragmented into multiple chunks * at once if non-fragmentable extension headers * are too large. * --yoshfuji */ cork->length += length; if (!skb) goto alloc_new_skb; while (length > 0) { /* Check if the remaining data fits into current packet. */ copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len; if (copy < length) copy = maxfraglen - skb->len; if (copy <= 0) { char *data; unsigned int datalen; unsigned int fraglen; unsigned int fraggap; unsigned int alloclen, alloc_extra; unsigned int pagedlen; alloc_new_skb: /* There's no room in the current skb */ if (skb) fraggap = skb->len - maxfraglen; else fraggap = 0; /* update mtu and maxfraglen if necessary */ if (!skb || !skb_prev) ip6_append_data_mtu(&mtu, &maxfraglen, fragheaderlen, skb, rt, orig_mtu); skb_prev = skb; /* * If remaining data exceeds the mtu, * we know we need more fragment(s). */ datalen = length + fraggap; if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen) datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; fraglen = datalen + fragheaderlen; pagedlen = 0; alloc_extra = hh_len; alloc_extra += dst_exthdrlen; alloc_extra += rt->dst.trailer_len; /* We just reserve space for fragment header. * Note: this may be overallocation if the message * (without MSG_MORE) fits into the MTU. */ alloc_extra += sizeof(struct frag_hdr); if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; else if (!paged && (fraglen + alloc_extra < SKB_MAX_ALLOC || !(rt->dst.dev->features & NETIF_F_SG))) alloclen = fraglen; else { alloclen = fragheaderlen + transhdrlen; pagedlen = datalen - transhdrlen; } alloclen += alloc_extra; if (datalen != length + fraggap) { /* * this is not the last fragment, the trailer * space is regarded as data space. */ datalen += rt->dst.trailer_len; } fraglen = datalen + fragheaderlen; copy = datalen - transhdrlen - fraggap - pagedlen; /* [!] NOTE: copy may be negative if pagedlen>0 * because then the equation may reduces to -fraggap. 
*/ if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) { err = -EINVAL; goto error; } if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen, (flags & MSG_DONTWAIT), &err); } else { skb = NULL; if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 2 * sk->sk_sndbuf) skb = alloc_skb(alloclen, sk->sk_allocation); if (unlikely(!skb)) err = -ENOBUFS; } if (!skb) goto error; /* * Fill in the control structures */ skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = csummode; skb->csum = 0; /* reserve for fragmentation and ipsec header */ skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + dst_exthdrlen); /* * Find where to start putting bytes */ data = skb_put(skb, fraglen - pagedlen); skb_set_network_header(skb, exthdrlen); data += fragheaderlen; skb->transport_header = (skb->network_header + fragheaderlen); if (fraggap) { skb->csum = skb_copy_and_csum_bits( skb_prev, maxfraglen, data + transhdrlen, fraggap); skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); data += fraggap; pskb_trim_unique(skb_prev, maxfraglen); } if (copy > 0 && INDIRECT_CALL_1(getfrag, ip_generic_getfrag, from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; } else if (flags & MSG_SPLICE_PAGES) { copy = 0; } offset += copy; length -= copy + transhdrlen; transhdrlen = 0; exthdrlen = 0; dst_exthdrlen = 0; /* Only the initial fragment is time stamped */ skb_shinfo(skb)->tx_flags = cork->tx_flags; cork->tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; skb_zcopy_set(skb, uarg, &extra_uref); if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); /* * Put the packet on the pending queue */ if (!skb->destructor) { skb->destructor = sock_wfree; skb->sk = sk; wmem_alloc_delta += skb->truesize; } __skb_queue_tail(queue, skb); continue; } if (copy > length) copy = length; if (!(rt->dst.dev->features&NETIF_F_SG) && skb_tailroom(skb) >= copy) { unsigned int off; off = skb->len; if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, from, skb_put(skb, copy), offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; goto error; } } else if (flags & MSG_SPLICE_PAGES) { struct msghdr *msg = from; err = -EIO; if (WARN_ON_ONCE(copy > msg->msg_iter.count)) goto error; err = skb_splice_from_iter(skb, &msg->msg_iter, copy, sk->sk_allocation); if (err < 0) goto error; copy = err; wmem_alloc_delta += copy; } else if (!zc) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; if (!sk_page_frag_refill(sk, pfrag)) goto error; skb_zcopy_downgrade_managed(skb); if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { err = -EMSGSIZE; if (i == MAX_SKB_FRAGS) goto error; __skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, 0); skb_shinfo(skb)->nr_frags = ++i; get_page(pfrag->page); } copy = min_t(int, copy, pfrag->size - pfrag->offset); if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, from, page_address(pfrag->page) + pfrag->offset, offset, copy, skb->len, skb) < 0) goto error_efault; pfrag->offset += copy; skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); skb->len += copy; skb->data_len += copy; skb->truesize += copy; wmem_alloc_delta += copy; } else { err = skb_zerocopy_iter_dgram(skb, from, copy); if (err < 0) goto error; } offset += copy; length -= copy; } if (wmem_alloc_delta) refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); return 0; error_efault: err = -EFAULT; error: net_zcopy_put_abort(uarg, extra_uref); cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, 
&sk->sk_wmem_alloc); if (hold_tskey) atomic_dec(&sk->sk_tskey); return err; } int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); int exthdrlen; int err; if (flags&MSG_PROBE) return 0; if (skb_queue_empty(&sk->sk_write_queue)) { /* * setup for corking */ dst_hold(&rt->dst); err = ip6_setup_cork(sk, &inet->cork, &np->cork, ipc6, rt); if (err) return err; inet->cork.fl.u.ip6 = *fl6; exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); length += exthdrlen; transhdrlen += exthdrlen; } else { transhdrlen = 0; } return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, &np->cork, sk_page_frag(sk), getfrag, from, length, transhdrlen, flags, ipc6); } EXPORT_SYMBOL_GPL(ip6_append_data); static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork) { struct dst_entry *dst = cork->base.dst; cork->base.dst = NULL; skb_dst_set(skb, dst); } static void ip6_cork_release(struct inet_cork_full *cork, struct inet6_cork *v6_cork) { if (v6_cork->opt) { struct ipv6_txoptions *opt = v6_cork->opt; kfree(opt->dst0opt); kfree(opt->dst1opt); kfree(opt->hopopt); kfree(opt->srcrt); kfree(opt); v6_cork->opt = NULL; } if (cork->base.dst) { dst_release(cork->base.dst); cork->base.dst = NULL; } } struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue, struct inet_cork_full *cork, struct inet6_cork *v6_cork) { struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct in6_addr *final_dst; struct net *net = sock_net(sk); struct ipv6hdr *hdr; struct ipv6_txoptions *opt = v6_cork->opt; struct rt6_info *rt = dst_rt6_info(cork->base.dst); struct flowi6 *fl6 = &cork->fl.u.ip6; unsigned char proto = fl6->flowi6_proto; skb = __skb_dequeue(queue); if (!skb) goto out; tail_skb = &(skb_shinfo(skb)->frag_list); /* move skb->data to ip header from ext header */ if (skb->data < skb_network_header(skb)) __skb_pull(skb, skb_network_offset(skb)); while ((tmp_skb = __skb_dequeue(queue)) != NULL) { __skb_pull(tmp_skb, skb_network_header_len(skb)); *tail_skb = tmp_skb; tail_skb = &(tmp_skb->next); skb->len += tmp_skb->len; skb->data_len += tmp_skb->len; skb->truesize += tmp_skb->truesize; tmp_skb->destructor = NULL; tmp_skb->sk = NULL; } /* Allow local fragmentation. 
 */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));
	final_dst = &fl6->daddr;
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, sk), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = cork->base.priority;
	skb->mark = cork->base.mark;
	if (sk_is_tcp(sk))
		skb_set_delivery_time(skb, cork->base.transmit_time,
				      SKB_CLOCK_MONOTONIC);
	else
		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time,
						 sk->sk_clockid);

	ip6_cork_steal_dst(skb, cork);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		if (sk->sk_socket->type == SOCK_RAW &&
		    !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	int err;

	rcu_read_lock();
	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	rcu_read_unlock();
	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE) {
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk);

	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
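The corked-output path above keeps re-deriving the same fragment geometry: the space left after the unfragmentable header chain, rounded down to a multiple of eight octets, minus the fragment header. The standalone sketch below only evaluates the maxfraglen expression from __ip6_append_data() for one example MTU so the rounding is easy to see in isolation; it is not part of ip6_output.c, and the 1500-byte MTU and the two length macros are illustrative values.

/*
 * Userspace sketch of the fragment-geometry arithmetic used above.
 * Illustrative only; header sizes are the usual IPv6 values, the MTU
 * is just an example.
 */
#include <stdio.h>

#define IPV6_HDR_LEN	40	/* sizeof(struct ipv6hdr) */
#define FRAG_HDR_LEN	 8	/* sizeof(struct frag_hdr) */

int main(void)
{
	unsigned int mtu = 1500;			/* example link MTU */
	unsigned int fragheaderlen = IPV6_HDR_LEN;	/* no extension headers */
	unsigned int maxfraglen;

	/* Same rounding as __ip6_append_data(): every fragment except
	 * the last must carry a payload that is a multiple of 8 octets.
	 */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen
		     - FRAG_HDR_LEN;

	printf("mtu=%u fragheaderlen=%u maxfraglen=%u\n",
	       mtu, fragheaderlen, maxfraglen);
	return 0;
}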
// SPDX-License-Identifier: GPL-2.0-only
/*
 * OCB mode implementation
 *
 * Copyright: (c) 2014 Czech Technical University in Prague
 *            (c) 2014 Volkswagen Group Research
 * Copyright (C) 2022 - 2024 Intel Corporation
 * Author:    Rostislav Lisovy <rostislav.lisovy@fel.cvut.cz>
 * Funded by: Volkswagen Group Research
 */

#include <linux/delay.h>
#include <linux/if_ether.h>
#include <linux/skbuff.h>
#include <linux/if_arp.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <net/mac80211.h>
#include <linux/unaligned.h>
#include "ieee80211_i.h"
#include "driver-ops.h"
#include "rate.h"

#define IEEE80211_OCB_HOUSEKEEPING_INTERVAL	(60 * HZ)
#define IEEE80211_OCB_PEER_INACTIVITY_LIMIT	(240 * HZ)
#define IEEE80211_OCB_MAX_STA_ENTRIES		128

/**
 * enum ocb_deferred_task_flags - mac80211 OCB deferred tasks
 * @OCB_WORK_HOUSEKEEPING: run the periodic OCB housekeeping tasks
 *
 * These flags are used in @wrkq_flags field of &struct ieee80211_if_ocb
 */
enum ocb_deferred_task_flags {
	OCB_WORK_HOUSEKEEPING,
};

void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata,
			     const u8 *bssid, const u8 *addr,
			     u32 supp_rates)
{
	struct ieee80211_if_ocb *ifocb = &sdata->u.ocb;
	struct ieee80211_local *local = sdata->local;
	struct ieee80211_chanctx_conf *chanctx_conf;
	struct ieee80211_supported_band *sband;
	struct sta_info *sta;
	int band;

	/* XXX: Consider removing the least recently used entry and
	 *	allow new one to be added.
*/ if (local->num_sta >= IEEE80211_OCB_MAX_STA_ENTRIES) { net_info_ratelimited("%s: No room for a new OCB STA entry %pM\n", sdata->name, addr); return; } ocb_dbg(sdata, "Adding new OCB station %pM\n", addr); rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON_ONCE(!chanctx_conf)) { rcu_read_unlock(); return; } band = chanctx_conf->def.chan->band; rcu_read_unlock(); sta = sta_info_alloc(sdata, addr, GFP_ATOMIC); if (!sta) return; /* Add only mandatory rates for now */ sband = local->hw.wiphy->bands[band]; sta->sta.deflink.supp_rates[band] = ieee80211_mandatory_rates(sband); spin_lock(&ifocb->incomplete_lock); list_add(&sta->list, &ifocb->incomplete_stations); spin_unlock(&ifocb->incomplete_lock); wiphy_work_queue(local->hw.wiphy, &sdata->work); } static struct sta_info *ieee80211_ocb_finish_sta(struct sta_info *sta) __acquires(RCU) { struct ieee80211_sub_if_data *sdata = sta->sdata; u8 addr[ETH_ALEN]; memcpy(addr, sta->sta.addr, ETH_ALEN); ocb_dbg(sdata, "Adding new IBSS station %pM (dev=%s)\n", addr, sdata->name); sta_info_move_state(sta, IEEE80211_STA_AUTH); sta_info_move_state(sta, IEEE80211_STA_ASSOC); sta_info_move_state(sta, IEEE80211_STA_AUTHORIZED); rate_control_rate_init(&sta->deflink); /* If it fails, maybe we raced another insertion? */ if (sta_info_insert_rcu(sta)) return sta_info_get(sdata, addr); return sta; } static void ieee80211_ocb_housekeeping(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ocb *ifocb = &sdata->u.ocb; ocb_dbg(sdata, "Running ocb housekeeping\n"); ieee80211_sta_expire(sdata, IEEE80211_OCB_PEER_INACTIVITY_LIMIT); mod_timer(&ifocb->housekeeping_timer, round_jiffies(jiffies + IEEE80211_OCB_HOUSEKEEPING_INTERVAL)); } void ieee80211_ocb_work(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ocb *ifocb = &sdata->u.ocb; struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (ifocb->joined != true) return; spin_lock_bh(&ifocb->incomplete_lock); while (!list_empty(&ifocb->incomplete_stations)) { sta = list_first_entry(&ifocb->incomplete_stations, struct sta_info, list); list_del(&sta->list); spin_unlock_bh(&ifocb->incomplete_lock); ieee80211_ocb_finish_sta(sta); rcu_read_unlock(); spin_lock_bh(&ifocb->incomplete_lock); } spin_unlock_bh(&ifocb->incomplete_lock); if (test_and_clear_bit(OCB_WORK_HOUSEKEEPING, &ifocb->wrkq_flags)) ieee80211_ocb_housekeeping(sdata); } static void ieee80211_ocb_housekeeping_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = from_timer(sdata, t, u.ocb.housekeeping_timer); struct ieee80211_local *local = sdata->local; struct ieee80211_if_ocb *ifocb = &sdata->u.ocb; set_bit(OCB_WORK_HOUSEKEEPING, &ifocb->wrkq_flags); wiphy_work_queue(local->hw.wiphy, &sdata->work); } void ieee80211_ocb_setup_sdata(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ocb *ifocb = &sdata->u.ocb; timer_setup(&ifocb->housekeeping_timer, ieee80211_ocb_housekeeping_timer, 0); INIT_LIST_HEAD(&ifocb->incomplete_stations); spin_lock_init(&ifocb->incomplete_lock); } int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata, struct ocb_setup *setup) { struct ieee80211_chan_req chanreq = { .oper = setup->chandef }; struct ieee80211_local *local = sdata->local; struct ieee80211_if_ocb *ifocb = &sdata->u.ocb; u64 changed = BSS_CHANGED_OCB | BSS_CHANGED_BSSID; int err; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (ifocb->joined == true) return -EINVAL; sdata->deflink.operating_11g_mode = true; sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; 
sdata->deflink.needed_rx_chains = sdata->local->rx_chains; err = ieee80211_link_use_channel(&sdata->deflink, &chanreq, IEEE80211_CHANCTX_SHARED); if (err) return err; ieee80211_bss_info_change_notify(sdata, changed); ifocb->joined = true; set_bit(OCB_WORK_HOUSEKEEPING, &ifocb->wrkq_flags); wiphy_work_queue(local->hw.wiphy, &sdata->work); netif_carrier_on(sdata->dev); return 0; } int ieee80211_ocb_leave(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ocb *ifocb = &sdata->u.ocb; struct ieee80211_local *local = sdata->local; struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); ifocb->joined = false; sta_info_flush(sdata, -1); spin_lock_bh(&ifocb->incomplete_lock); while (!list_empty(&ifocb->incomplete_stations)) { sta = list_first_entry(&ifocb->incomplete_stations, struct sta_info, list); list_del(&sta->list); spin_unlock_bh(&ifocb->incomplete_lock); sta_info_free(local, sta); spin_lock_bh(&ifocb->incomplete_lock); } spin_unlock_bh(&ifocb->incomplete_lock); netif_carrier_off(sdata->dev); clear_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_OCB); ieee80211_link_release_channel(&sdata->deflink); skb_queue_purge(&sdata->skb_queue); del_timer_sync(&sdata->u.ocb.housekeeping_timer); /* If the timer fired while we waited for it, it will have * requeued the work. Now the work will be running again * but will not rearm the timer again because it checks * whether we are connected to the network or not -- at this * point we shouldn't be anymore. */ return 0; } |
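ieee80211_ocb_rx_no_sta() and ieee80211_ocb_work() above split station creation in two: the RX path only allocates an entry and queues it on incomplete_stations under a spinlock, and the deferred work later drains that list and finishes the setup. The userspace sketch below shows the same enqueue-then-drain shape with a pthread mutex; every name and type in it is invented for the illustration and it is not mac80211 code.

/* Sketch of the "queue from the hot path, finish in a worker" pattern. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct pending_sta {
	struct pending_sta *next;
	unsigned char addr[6];
};

static pthread_mutex_t incomplete_lock = PTHREAD_MUTEX_INITIALIZER;
static struct pending_sta *incomplete_stations;

/* "RX path": cheap, lock-protected enqueue only. */
static void rx_no_sta(const unsigned char addr[6])
{
	struct pending_sta *sta = calloc(1, sizeof(*sta));

	if (!sta)
		return;
	memcpy(sta->addr, addr, sizeof(sta->addr));

	pthread_mutex_lock(&incomplete_lock);
	sta->next = incomplete_stations;
	incomplete_stations = sta;
	pthread_mutex_unlock(&incomplete_lock);
}

/* "Worker": drops the lock while it finishes each entry, like the
 * real work function does around ieee80211_ocb_finish_sta().
 */
static void work(void)
{
	pthread_mutex_lock(&incomplete_lock);
	while (incomplete_stations) {
		struct pending_sta *sta = incomplete_stations;

		incomplete_stations = sta->next;
		pthread_mutex_unlock(&incomplete_lock);

		printf("finishing station %02x:%02x:...\n",
		       sta->addr[0], sta->addr[1]);
		free(sta);

		pthread_mutex_lock(&incomplete_lock);
	}
	pthread_mutex_unlock(&incomplete_lock);
}

int main(void)
{
	const unsigned char a[6] = { 0x02, 0x11, 0x22, 0x33, 0x44, 0x55 };

	rx_no_sta(a);
	work();
	return 0;
}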
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Connection tracking protocol helper module for SCTP.
 *
 * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com>
 * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net>
 *
 * SCTP is defined in RFC 2960. References to various sections in this code
 * are to this RFC.
*/ #include <linux/types.h> #include <linux/timer.h> #include <linux/netfilter.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/sctp.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <net/sctp/checksum.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> static const char *const sctp_conntrack_names[] = { [SCTP_CONNTRACK_NONE] = "NONE", [SCTP_CONNTRACK_CLOSED] = "CLOSED", [SCTP_CONNTRACK_COOKIE_WAIT] = "COOKIE_WAIT", [SCTP_CONNTRACK_COOKIE_ECHOED] = "COOKIE_ECHOED", [SCTP_CONNTRACK_ESTABLISHED] = "ESTABLISHED", [SCTP_CONNTRACK_SHUTDOWN_SENT] = "SHUTDOWN_SENT", [SCTP_CONNTRACK_SHUTDOWN_RECD] = "SHUTDOWN_RECD", [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = "SHUTDOWN_ACK_SENT", [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT", }; static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { [SCTP_CONNTRACK_CLOSED] = secs_to_jiffies(10), [SCTP_CONNTRACK_COOKIE_WAIT] = secs_to_jiffies(3), [SCTP_CONNTRACK_COOKIE_ECHOED] = secs_to_jiffies(3), [SCTP_CONNTRACK_ESTABLISHED] = secs_to_jiffies(210), [SCTP_CONNTRACK_SHUTDOWN_SENT] = secs_to_jiffies(3), [SCTP_CONNTRACK_SHUTDOWN_RECD] = secs_to_jiffies(3), [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = secs_to_jiffies(3), [SCTP_CONNTRACK_HEARTBEAT_SENT] = secs_to_jiffies(30), }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 #define sNO SCTP_CONNTRACK_NONE #define sCL SCTP_CONNTRACK_CLOSED #define sCW SCTP_CONNTRACK_COOKIE_WAIT #define sCE SCTP_CONNTRACK_COOKIE_ECHOED #define sES SCTP_CONNTRACK_ESTABLISHED #define sSS SCTP_CONNTRACK_SHUTDOWN_SENT #define sSR SCTP_CONNTRACK_SHUTDOWN_RECD #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT #define sHS SCTP_CONNTRACK_HEARTBEAT_SENT #define sIV SCTP_CONNTRACK_MAX /* These are the descriptions of the states: NOTE: These state names are tantalizingly similar to the states of an SCTP endpoint. But the interpretation of the states is a little different, considering that these are the states of the connection and not of an end point. Please note the subtleties. -Kiran NONE - Nothing so far. COOKIE WAIT - We have seen an INIT chunk in the original direction, or also an INIT_ACK chunk in the reply direction. COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction. SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite to that of the SHUTDOWN chunk. CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. */ /* TODO - I have assumed that the first INIT is in the original direction. This messes things when an INIT comes in the reply direction in CLOSED state. - Check the error type in the reply dir before transitioning from cookie echoed to closed. - Sec 5.2.4 of RFC 2960 - Full Multi Homing support. 
*/ /* SCTP conntrack state transitions */ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ /* init */ {sCL, sCL, sCW, sCE, sES, sCL, sCL, sSA, sCW}, /* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL}, /* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, /* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL}, /* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA}, /* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/ /* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */ /* cookie_ack */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ /* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL}, /* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, }, { /* REPLY */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */ /* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* INIT in sCL Big TODO */ /* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV}, /* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV}, /* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV}, /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV}, /* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV}, /* cookie_echo */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ /* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV}, /* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV}, /* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sES}, } }; #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "%s ", sctp_conntrack_names[ct->proto.sctp.state]); } #endif /* do_basic_checks ensures sch->length > 0, do not use before */ #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ (offset) < (skb)->len && \ ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \ (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++) /* Some validity checks to make sure the chunks are fine */ static int do_basic_checks(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned long *map, const struct nf_hook_state *state) { u_int32_t offset, count; struct sctp_chunkhdr _sch, *sch; int flag; flag = 0; for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { if (sch->type == SCTP_CID_INIT || sch->type == SCTP_CID_INIT_ACK || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) flag = 1; /* * Cookie Ack/Echo chunks not the first OR * Init / Init Ack / Shutdown compl chunks not the only chunks * OR zero-length. */ if (((sch->type == SCTP_CID_COOKIE_ACK || sch->type == SCTP_CID_COOKIE_ECHO || flag) && count != 0) || !sch->length) { nf_ct_l4proto_log_invalid(skb, ct, state, "%s failed. 
chunk num %d, type %d, len %d flag %d\n", __func__, count, sch->type, sch->length, flag); return 1; } if (map) set_bit(sch->type, map); } return count == 0; } static int sctp_new_state(enum ip_conntrack_dir dir, enum sctp_conntrack cur_state, int chunk_type) { int i; switch (chunk_type) { case SCTP_CID_INIT: i = 0; break; case SCTP_CID_INIT_ACK: i = 1; break; case SCTP_CID_ABORT: i = 2; break; case SCTP_CID_SHUTDOWN: i = 3; break; case SCTP_CID_SHUTDOWN_ACK: i = 4; break; case SCTP_CID_ERROR: i = 5; break; case SCTP_CID_COOKIE_ECHO: i = 6; break; case SCTP_CID_COOKIE_ACK: i = 7; break; case SCTP_CID_SHUTDOWN_COMPLETE: i = 8; break; case SCTP_CID_HEARTBEAT: i = 9; break; case SCTP_CID_HEARTBEAT_ACK: i = 10; break; default: /* Other chunks like DATA or SACK do not change the state */ pr_debug("Unknown chunk type %d, Will stay in %s\n", chunk_type, sctp_conntrack_names[cur_state]); return cur_state; } return sctp_conntracks[dir][i][cur_state]; } /* Don't need lock here: this conntrack not in circulation yet */ static noinline bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, const struct sctphdr *sh, unsigned int dataoff) { enum sctp_conntrack new_state; const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; u32 offset, count; memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp)); new_state = SCTP_CONNTRACK_MAX; for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) { new_state = sctp_new_state(IP_CT_DIR_ORIGINAL, SCTP_CONNTRACK_NONE, sch->type); /* Invalid: delete conntrack */ if (new_state == SCTP_CONNTRACK_NONE || new_state == SCTP_CONNTRACK_MAX) { pr_debug("nf_conntrack_sctp: invalid new deleting.\n"); return false; } /* Copy the vtag into the state info */ if (sch->type == SCTP_CID_INIT) { struct sctp_inithdr _inithdr, *ih; /* Sec 8.5.1 (A) */ if (sh->vtag) return false; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(_inithdr), &_inithdr); if (!ih) return false; pr_debug("Setting vtag %x for new conn\n", ih->init_tag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag; } else if (sch->type == SCTP_CID_HEARTBEAT) { pr_debug("Setting vtag %x for secondary conntrack\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; } else if (sch->type == SCTP_CID_SHUTDOWN_ACK) { /* If it is a shutdown ack OOTB packet, we expect a return shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ pr_debug("Setting vtag %x for new conn OOTB\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; } ct->proto.sctp.state = SCTP_CONNTRACK_NONE; } return true; } static bool sctp_error(struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { const struct sctphdr *sh; const char *logmsg; if (skb->len < dataoff + sizeof(struct sctphdr)) { logmsg = "nf_ct_sctp: short packet "; goto out_invalid; } if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && skb->ip_summed == CHECKSUM_NONE) { if (skb_ensure_writable(skb, dataoff + sizeof(*sh))) { logmsg = "nf_ct_sctp: failed to read header "; goto out_invalid; } sh = (const struct sctphdr *)(skb->data + dataoff); if (sh->checksum != sctp_compute_cksum(skb, dataoff)) { logmsg = "nf_ct_sctp: bad CRC "; goto out_invalid; } skb->ip_summed = CHECKSUM_UNNECESSARY; } return false; out_invalid: nf_l4proto_log_invalid(skb, state, IPPROTO_SCTP, "%s", logmsg); return true; } /* Returns verdict for packet, or -NF_ACCEPT for invalid. 
*/ int nf_conntrack_sctp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { enum sctp_conntrack new_state, old_state; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); const struct sctphdr *sh; struct sctphdr _sctph; const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; u_int32_t offset, count; unsigned int *timeouts; unsigned long map[256 / sizeof(unsigned long)] = { 0 }; bool ignore = false; if (sctp_error(skb, dataoff, state)) return -NF_ACCEPT; sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); if (sh == NULL) goto out; if (do_basic_checks(ct, skb, dataoff, map, state) != 0) goto out; if (!nf_ct_is_confirmed(ct)) { /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ if (test_bit(SCTP_CID_ABORT, map) || test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) || test_bit(SCTP_CID_COOKIE_ACK, map)) return -NF_ACCEPT; if (!sctp_new(ct, skb, sh, dataoff)) return -NF_ACCEPT; } /* Check the verification tag (Sec 8.5) */ if (!test_bit(SCTP_CID_INIT, map) && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && !test_bit(SCTP_CID_COOKIE_ECHO, map) && !test_bit(SCTP_CID_ABORT, map) && !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && !test_bit(SCTP_CID_HEARTBEAT, map) && !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && sh->vtag != ct->proto.sctp.vtag[dir]) { nf_ct_l4proto_log_invalid(skb, ct, state, "verification tag check failed %x vs %x for dir %d", sh->vtag, ct->proto.sctp.vtag[dir], dir); goto out; } old_state = new_state = SCTP_CONNTRACK_NONE; spin_lock_bh(&ct->lock); for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { /* Special cases of Verification tag check (Sec 8.5.1) */ if (sch->type == SCTP_CID_INIT) { /* (A) vtag MUST be zero */ if (sh->vtag != 0) goto out_unlock; } else if (sch->type == SCTP_CID_ABORT) { /* (B) vtag MUST match own vtag if T flag is unset OR * MUST match peer's vtag if T flag is set */ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[dir]) || ((sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { /* (C) vtag MUST match own vtag if T flag is unset OR * MUST match peer's vtag if T flag is set */ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[dir]) || ((sch->flags & SCTP_CHUNK_FLAG_T) && sh->vtag != ct->proto.sctp.vtag[!dir])) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ECHO) { /* (D) vtag must be same as init_vtag as found in INIT_ACK */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; } else if (sch->type == SCTP_CID_COOKIE_ACK) { ct->proto.sctp.init[dir] = 0; ct->proto.sctp.init[!dir] = 0; } else if (sch->type == SCTP_CID_HEARTBEAT) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir); ct->proto.sctp.vtag[dir] = sh->vtag; } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { if (test_bit(SCTP_CID_DATA, map) || ignore) goto out_unlock; ct->proto.sctp.flags |= SCTP_FLAG_HEARTBEAT_VTAG_FAILED; ct->proto.sctp.last_dir = dir; ignore = true; continue; } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } } else if (sch->type == SCTP_CID_HEARTBEAT_ACK) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir); ct->proto.sctp.vtag[dir] = sh->vtag; } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { if (test_bit(SCTP_CID_DATA, map) || ignore) goto out_unlock; 
if ((ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) == 0 || ct->proto.sctp.last_dir == dir) goto out_unlock; ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; ct->proto.sctp.vtag[dir] = sh->vtag; ct->proto.sctp.vtag[!dir] = 0; } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } } old_state = ct->proto.sctp.state; new_state = sctp_new_state(dir, old_state, sch->type); /* Invalid */ if (new_state == SCTP_CONNTRACK_MAX) { nf_ct_l4proto_log_invalid(skb, ct, state, "Invalid, old_state %d, dir %d, type %d", old_state, dir, sch->type); goto out_unlock; } /* If it is an INIT or an INIT ACK note down the vtag */ if (sch->type == SCTP_CID_INIT) { struct sctp_inithdr _ih, *ih; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); if (!ih) goto out_unlock; if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]) ct->proto.sctp.init[!dir] = 0; ct->proto.sctp.init[dir] = 1; pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; /* don't renew timeout on init retransmit so * port reuse by client or NAT middlebox cannot * keep entry alive indefinitely (incl. nat info). */ if (new_state == SCTP_CONNTRACK_CLOSED && old_state == SCTP_CONNTRACK_CLOSED && nf_ct_is_confirmed(ct)) ignore = true; } else if (sch->type == SCTP_CID_INIT_ACK) { struct sctp_inithdr _ih, *ih; __be32 vtag; ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); if (!ih) goto out_unlock; vtag = ct->proto.sctp.vtag[!dir]; if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag) goto out_unlock; /* collision */ if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] && vtag != ih->init_tag) goto out_unlock; pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; } ct->proto.sctp.state = new_state; if (old_state != new_state) { nf_conntrack_event_cache(IPCT_PROTOINFO, ct); if (new_state == SCTP_CONNTRACK_ESTABLISHED && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } } spin_unlock_bh(&ct->lock); /* allow but do not refresh timeout */ if (ignore) return NF_ACCEPT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); return NF_ACCEPT; out_unlock: spin_unlock_bh(&ct->lock); out: return -NF_ACCEPT; } static bool sctp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.sctp.state) { case SCTP_CONNTRACK_SHUTDOWN_SENT: case SCTP_CONNTRACK_SHUTDOWN_RECD: case SCTP_CONNTRACK_SHUTDOWN_ACK_SENT: return true; default: break; } return false; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; spin_lock_bh(&ct->lock); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_SCTP); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state)) goto nla_put_failure; if (destroy) goto skip_state; if (nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) || nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY, ct->proto.sctp.vtag[IP_CT_DIR_REPLY])) goto nla_put_failure; skip_state: spin_unlock_bh(&ct->lock); nla_nest_end(skb, nest_parms); return 0; nla_put_failure: spin_unlock_bh(&ct->lock); 
return -1; } static const struct nla_policy sctp_nla_policy[CTA_PROTOINFO_SCTP_MAX+1] = { [CTA_PROTOINFO_SCTP_STATE] = { .type = NLA_U8 }, [CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] = { .type = NLA_U32 }, [CTA_PROTOINFO_SCTP_VTAG_REPLY] = { .type = NLA_U32 }, }; #define SCTP_NLATTR_SIZE ( \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + 4) + \ NLA_ALIGN(NLA_HDRLEN + 4)) static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct) { struct nlattr *attr = cda[CTA_PROTOINFO_SCTP]; struct nlattr *tb[CTA_PROTOINFO_SCTP_MAX+1]; int err; /* updates may not contain the internal protocol info, skip parsing */ if (!attr) return 0; err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_SCTP_MAX, attr, sctp_nla_policy, NULL); if (err < 0) return err; if (!tb[CTA_PROTOINFO_SCTP_STATE] || !tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] || !tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]) return -EINVAL; spin_lock_bh(&ct->lock); ct->proto.sctp.state = nla_get_u8(tb[CTA_PROTOINFO_SCTP_STATE]); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]); spin_unlock_bh(&ct->lock); return 0; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; if (!timeouts) timeouts = sn->timeouts; /* set default SCTP timeouts. */ for (i=0; i<SCTP_CONNTRACK_MAX; i++) timeouts[i] = sn->timeouts[i]; /* there's a 1:1 mapping between attributes and protocol states. */ for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { if (tb[i]) { timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; } } timeouts[CTA_TIMEOUT_SCTP_UNSPEC] = timeouts[CTA_TIMEOUT_SCTP_CLOSED]; return 0; } static int sctp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; int i; for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) goto nla_put_failure; } return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { [CTA_TIMEOUT_SCTP_CLOSED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_COOKIE_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_COOKIE_ECHOED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_ESTABLISHED] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_sctp_init_net(struct net *net) { struct nf_sctp_net *sn = nf_sctp_pernet(net); int i; for (i = 0; i < SCTP_CONNTRACK_MAX; i++) sn->timeouts[i] = sctp_timeouts[i]; /* timeouts[0] is unused, init it so ->timeouts[0] contains * 'new' timeout, like udp or icmp. 
*/ sn->timeouts[0] = sctp_timeouts[SCTP_CONNTRACK_CLOSED]; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = { .l4proto = IPPROTO_SCTP, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif .can_early_drop = sctp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_size = SCTP_NLATTR_SIZE, .to_nlattr = sctp_to_nlattr, .from_nlattr = nlattr_to_sctp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = sctp_timeout_nlattr_to_obj, .obj_to_nlattr = sctp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_SCTP_MAX, .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX, .nla_policy = sctp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; |
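Both do_basic_checks() and nf_conntrack_sctp_packet() above iterate chunks with for_each_sctp_chunk(), which advances by the chunk length rounded up to a multiple of four. The userspace sketch below walks a small hand-built buffer with the same rounding rule; struct chunkhdr and the example chunk values are invented for the illustration and carry no real packet data.

/* Sketch of the 4-byte-aligned chunk walk used by for_each_sctp_chunk(). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

struct chunkhdr {
	uint8_t type;
	uint8_t flags;
	uint16_t length;	/* network byte order, includes this header */
};

int main(void)
{
	uint8_t buf[32];
	size_t len = 0;

	/* chunk 1: 6 bytes of value -> length 10, padded to 12 on the wire */
	struct chunkhdr c1 = { .type = 1, .flags = 0, .length = htons(10) };
	memcpy(buf + len, &c1, sizeof(c1));
	len += 12;

	/* chunk 2: 4 bytes of value -> length 8, already aligned */
	struct chunkhdr c2 = { .type = 7, .flags = 0, .length = htons(8) };
	memcpy(buf + len, &c2, sizeof(c2));
	len += 8;

	for (size_t off = 0; off + sizeof(struct chunkhdr) <= len; ) {
		struct chunkhdr ch;

		memcpy(&ch, buf + off, sizeof(ch));
		printf("chunk type %u, length %u at offset %zu\n",
		       ch.type, ntohs(ch.length), off);
		/* same rounding as the kernel macro */
		off += (ntohs(ch.length) + 3) & ~3u;
	}
	return 0;
}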
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _LINUX_IO_URING_H
#define _LINUX_IO_URING_H

#include <linux/sched.h>
#include <linux/xarray.h>
#include <uapi/linux/io_uring.h>

#if defined(CONFIG_IO_URING)
void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk);
void io_uring_unreg_ringfd(void);
const char *io_uring_get_opcode(u8 opcode);
bool io_is_uring_fops(struct file *file);

static inline void io_uring_files_cancel(void)
{
	if (current->io_uring)
		__io_uring_cancel(false);
}
static inline void io_uring_task_cancel(void)
{
	if (current->io_uring)
		__io_uring_cancel(true);
}
static inline void io_uring_free(struct task_struct *tsk)
{
	if (tsk->io_uring)
		__io_uring_free(tsk);
}
#else
static inline void io_uring_task_cancel(void)
{
}
static inline void io_uring_files_cancel(void)
{
}
static inline void io_uring_free(struct task_struct *tsk)
{
}
static inline const char *io_uring_get_opcode(u8 opcode)
{
	return "";
}
static inline bool io_is_uring_fops(struct file *file)
{
	return false;
}
#endif

#endif
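The #else branch of this header shows the usual pattern for optional kernel features: when CONFIG_IO_URING is not set, the same function names are provided as empty static inlines so call sites compile without their own #ifdefs. The fragment below repeats that pattern for a made-up CONFIG_FOO feature; it is only a sketch of the idiom, not a real kernel header.

/* Sketch of the "stub out a compiled-out feature with static inlines" idiom. */
#ifndef _EXAMPLE_FOO_H
#define _EXAMPLE_FOO_H

#if defined(CONFIG_FOO)
void __foo_shutdown(int force);

static inline void foo_shutdown(void)
{
	/* real work only happens when the feature is built in */
	__foo_shutdown(0);
}
#else
static inline void foo_shutdown(void)
{
	/* compiled out: callers still compile, the call disappears */
}
#endif

#endif /* _EXAMPLE_FOO_H */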
2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 
3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 
// SPDX-License-Identifier: GPL-2.0-only /* * Memory merging support. * * This code enables dynamic sharing of identical pages found in different * memory areas, even if they are not shared by fork() * * Copyright (C) 2008-2009 Red Hat, Inc. * Authors: * Izik Eidus * Andrea Arcangeli * Chris Wright * Hugh Dickins */ #include <linux/errno.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/fs.h> #include <linux/mman.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/cputime.h> #include <linux/rwsem.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/spinlock.h> #include <linux/xxhash.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/memory.h> #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/ksm.h> #include <linux/hashtable.h> #include <linux/freezer.h> #include <linux/oom.h> #include <linux/numa.h> #include <linux/pagewalk.h> #include <asm/tlbflush.h> #include "internal.h" #include "mm_slot.h" #define CREATE_TRACE_POINTS #include <trace/events/ksm.h> #ifdef CONFIG_NUMA #define NUMA(x) (x) #define DO_NUMA(x) do { (x); } while (0) #else #define NUMA(x) (0) #define DO_NUMA(x) do { } while (0) #endif typedef u8 rmap_age_t; /** * DOC: Overview * * A few notes about the KSM scanning process, * to make it easier to understand the data structures below: * * In order to reduce excessive scanning, KSM sorts the memory pages by their * contents into a data structure that holds pointers to the pages' locations. * * Since the contents of the pages may change at any moment, KSM cannot just * insert the pages into a normal sorted tree and expect it to find anything. * Therefore KSM uses two data structures - the stable and the unstable tree. * * The stable tree holds pointers to all the merged pages (ksm pages), sorted * by their contents. Because each such page is write-protected, searching on * this tree is fully assured to be working (except when pages are unmapped), * and therefore this tree is called the stable tree. * * The stable tree node includes information required for reverse * mapping from a KSM page to virtual addresses that map this page. * * In order to avoid large latencies of the rmap walks on KSM pages, * KSM maintains two types of nodes in the stable tree: * * * the regular nodes that keep the reverse mapping structures in a * linked list * * the "chains" that link nodes ("dups") that represent the same * write protected memory content, but each "dup" corresponds to a * different KSM page copy of that content * * Internally, the regular nodes, "dups" and "chains" are represented * using the same struct ksm_stable_node structure.
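* * For illustration, assuming the default max_page_sharing limit of 256: merging 1000 pages with identical content leaves a chain of roughly four dups, i.e. four separate write-protected KSM page copies, each mapped by at most 256 page slots.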
* * In addition to the stable tree, KSM uses a second data structure called the * unstable tree: this tree holds pointers to pages which have been found to * be "unchanged for a period of time". The unstable tree sorts these pages * by their contents, but since they are not write-protected, KSM cannot rely * upon the unstable tree to work correctly - the unstable tree is liable to * be corrupted as its contents are modified, and so it is called unstable. * * KSM solves this problem by several techniques: * * 1) The unstable tree is flushed every time KSM completes scanning all * memory areas, and then the tree is rebuilt again from the beginning. * 2) KSM will only insert into the unstable tree, pages whose hash value * has not changed since the previous scan of all memory areas. * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the * colors of the nodes and not on their contents, assuring that even when * the tree gets "corrupted" it won't get out of balance, so scanning time * remains the same (also, searching and inserting nodes in an rbtree uses * the same algorithm, so we have no overhead when we flush and rebuild). * 4) KSM never flushes the stable tree, which means that even if it were to * take 10 attempts to find a page in the unstable tree, once it is found, * it is secured in the stable tree. (When we scan a new page, we first * compare it against the stable tree, and then against the unstable tree.) * * If the merge_across_nodes tunable is unset, then KSM maintains multiple * stable trees and multiple unstable trees: one of each for each NUMA node. */ /** * struct ksm_mm_slot - ksm information per mm that is being scanned * @slot: hash lookup from mm to mm_slot * @rmap_list: head for this mm_slot's singly-linked list of rmap_items */ struct ksm_mm_slot { struct mm_slot slot; struct ksm_rmap_item *rmap_list; }; /** * struct ksm_scan - cursor for scanning * @mm_slot: the current mm_slot we are scanning * @address: the next address inside that to be scanned * @rmap_list: link to the next rmap to be scanned in the rmap_list * @seqnr: count of completed full scans (needed when removing unstable node) * * There is only the one ksm_scan instance of this cursor structure. */ struct ksm_scan { struct ksm_mm_slot *mm_slot; unsigned long address; struct ksm_rmap_item **rmap_list; unsigned long seqnr; }; /** * struct ksm_stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list * @hlist_dup: linked into the stable_node->hlist with a stable_node chain * @list: linked into migrate_nodes, pending placement in the proper node tree * @hlist: hlist head of rmap_items using this ksm page * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) * @chain_prune_time: time of the last full garbage collection * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct ksm_stable_node { union { struct rb_node node; /* when node of stable tree */ struct { /* when listed for migration */ struct list_head *head; struct { struct hlist_node hlist_dup; struct list_head list; }; }; }; struct hlist_head hlist; union { unsigned long kpfn; unsigned long chain_prune_time; }; /* * STABLE_NODE_CHAIN can be any negative number in * rmap_hlist_len negative range, but better not -1 to be able * to reliably detect underflows. 
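* For example, a stray extra decrement of a dup whose rmap_hlist_len is already 0 yields -1, which would be indistinguishable from the chain marker if -1 were chosen; a value such as -1024 keeps such an underflow detectable.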
*/ #define STABLE_NODE_CHAIN -1024 int rmap_hlist_len; #ifdef CONFIG_NUMA int nid; #endif }; /** * struct ksm_rmap_item - reverse mapping item for virtual addresses * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree * @nid: NUMA node id of unstable tree in which linked (may not match page) * @mm: the memory structure this rmap_item is pointing into * @address: the virtual address this rmap_item tracks (+ flags in low bits) * @oldchecksum: previous checksum of the page at that virtual address * @node: rb node of this rmap_item in the unstable tree * @head: pointer to stable_node heading this list in the stable tree * @hlist: link into hlist of rmap_items hanging off that stable_node * @age: number of scan iterations since creation * @remaining_skips: how many scans to skip */ struct ksm_rmap_item { struct ksm_rmap_item *rmap_list; union { struct anon_vma *anon_vma; /* when stable */ #ifdef CONFIG_NUMA int nid; /* when node of unstable tree */ #endif }; struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ unsigned int oldchecksum; /* when unstable */ rmap_age_t age; rmap_age_t remaining_skips; union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ struct ksm_stable_node *head; struct hlist_node hlist; }; }; }; #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ /* The stable and unstable tree heads */ static struct rb_root one_stable_tree[1] = { RB_ROOT }; static struct rb_root one_unstable_tree[1] = { RB_ROOT }; static struct rb_root *root_stable_tree = one_stable_tree; static struct rb_root *root_unstable_tree = one_unstable_tree; /* Recently migrated nodes of stable tree, pending proper placement */ static LIST_HEAD(migrate_nodes); #define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev) #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct ksm_mm_slot ksm_mm_head = { .slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node), }; static struct ksm_scan ksm_scan = { .mm_slot = &ksm_mm_head, }; static struct kmem_cache *rmap_item_cache; static struct kmem_cache *stable_node_cache; static struct kmem_cache *mm_slot_cache; /* Default number of pages to scan per batch */ #define DEFAULT_PAGES_TO_SCAN 100 /* The number of pages scanned */ static unsigned long ksm_pages_scanned; /* The number of nodes in the stable tree */ static unsigned long ksm_pages_shared; /* The number of page slots additionally sharing those nodes */ static unsigned long ksm_pages_sharing; /* The number of nodes in the unstable tree */ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; /* The number of stable_node chains */ static unsigned long ksm_stable_node_chains; /* The number of stable_node dups linked to the stable_node chains */ static unsigned long ksm_stable_node_dups; /* Delay in pruning stale stable_node_dups in the stable_node_chains */ static unsigned int ksm_stable_node_chains_prune_millisecs = 2000; /* Maximum number of page slots sharing a stable node */ static int ksm_max_page_sharing = 256; /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN; /* Milliseconds ksmd should sleep 
between batches */ static unsigned int ksm_thread_sleep_millisecs = 20; /* Checksum of an empty (zeroed) page */ static unsigned int zero_checksum __read_mostly; /* Whether to merge empty (zeroed) pages with actual zero pages */ static bool ksm_use_zero_pages __read_mostly; /* Skip pages that couldn't be de-duplicated previously */ /* Default to true at least temporarily, for testing */ static bool ksm_smart_scan = true; /* The number of zero pages which is placed by KSM */ atomic_long_t ksm_zero_pages = ATOMIC_LONG_INIT(0); /* The number of pages that have been skipped due to "smart scanning" */ static unsigned long ksm_pages_skipped; /* Don't scan more than max pages per batch. */ static unsigned long ksm_advisor_max_pages_to_scan = 30000; /* Min CPU for scanning pages per scan */ #define KSM_ADVISOR_MIN_CPU 10 /* Max CPU for scanning pages per scan */ static unsigned int ksm_advisor_max_cpu = 70; /* Target scan time in seconds to analyze all KSM candidate pages. */ static unsigned long ksm_advisor_target_scan_time = 200; /* Exponentially weighted moving average. */ #define EWMA_WEIGHT 30 /** * struct advisor_ctx - metadata for KSM advisor * @start_scan: start time of the current scan * @scan_time: scan time of previous scan * @change: change in percent to pages_to_scan parameter * @cpu_time: cpu time consumed by the ksmd thread in the previous scan */ struct advisor_ctx { ktime_t start_scan; unsigned long scan_time; unsigned long change; unsigned long long cpu_time; }; static struct advisor_ctx advisor_ctx; /* Define different advisor's */ enum ksm_advisor_type { KSM_ADVISOR_NONE, KSM_ADVISOR_SCAN_TIME, }; static enum ksm_advisor_type ksm_advisor; #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: */ /* At least scan this many pages per batch. */ static unsigned long ksm_advisor_min_pages_to_scan = 500; static void set_advisor_defaults(void) { if (ksm_advisor == KSM_ADVISOR_NONE) { ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN; } else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) { advisor_ctx = (const struct advisor_ctx){ 0 }; ksm_thread_pages_to_scan = ksm_advisor_min_pages_to_scan; } } #endif /* CONFIG_SYSFS */ static inline void advisor_start_scan(void) { if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) advisor_ctx.start_scan = ktime_get(); } /* * Use previous scan time if available, otherwise use current scan time as an * approximation for the previous scan time. */ static inline unsigned long prev_scan_time(struct advisor_ctx *ctx, unsigned long scan_time) { return ctx->scan_time ? ctx->scan_time : scan_time; } /* Calculate exponential weighted moving average */ static unsigned long ewma(unsigned long prev, unsigned long curr) { return ((100 - EWMA_WEIGHT) * prev + EWMA_WEIGHT * curr) / 100; } /* * The scan time advisor is based on the current scan rate and the target * scan rate. * * new_pages_to_scan = pages_to_scan * (scan_time / target_scan_time) * * To avoid perturbations it calculates a change factor of previous changes. * A new change factor is calculated for each iteration and it uses an * exponentially weighted moving average. The new pages_to_scan value is * multiplied with that change factor: * * new_pages_to_scan *= change facor * * The new_pages_to_scan value is limited by the cpu min and max values. It * calculates the cpu percent for the last scan and calculates the new * estimated cpu percent cost for the next scan. That value is capped by the * cpu min and max setting. 
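* * A worked example, assuming pages_to_scan = 1000, a scan that took 400 seconds against the default 200 second target, a steady (100%) change factor and 20% measured CPU: the scan rate step yields 1000 * 400 / 200 = 2000 pages, the EWMA change factor stays at 100 so the value is unchanged, the per-page cost is 1000 / 20 = 50, and the CPU caps allow anything between 50 * 10 = 500 and 50 * 70 = 3500 pages, so pages_to_scan becomes 2000 for the next scan.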
* * In addition the new pages_to_scan value is capped by the max and min * limits. */ static void scan_time_advisor(void) { unsigned int cpu_percent; unsigned long cpu_time; unsigned long cpu_time_diff; unsigned long cpu_time_diff_ms; unsigned long pages; unsigned long per_page_cost; unsigned long factor; unsigned long change; unsigned long last_scan_time; unsigned long scan_time; /* Convert scan time to seconds */ scan_time = div_s64(ktime_ms_delta(ktime_get(), advisor_ctx.start_scan), MSEC_PER_SEC); scan_time = scan_time ? scan_time : 1; /* Calculate CPU consumption of ksmd background thread */ cpu_time = task_sched_runtime(current); cpu_time_diff = cpu_time - advisor_ctx.cpu_time; cpu_time_diff_ms = cpu_time_diff / 1000 / 1000; cpu_percent = (cpu_time_diff_ms * 100) / (scan_time * 1000); cpu_percent = cpu_percent ? cpu_percent : 1; last_scan_time = prev_scan_time(&advisor_ctx, scan_time); /* Calculate scan time as percentage of target scan time */ factor = ksm_advisor_target_scan_time * 100 / scan_time; factor = factor ? factor : 1; /* * Calculate scan time as percentage of last scan time and use * exponentially weighted average to smooth it */ change = scan_time * 100 / last_scan_time; change = change ? change : 1; change = ewma(advisor_ctx.change, change); /* Calculate new scan rate based on target scan rate. */ pages = ksm_thread_pages_to_scan * 100 / factor; /* Update pages_to_scan by weighted change percentage. */ pages = pages * change / 100; /* Cap new pages_to_scan value */ per_page_cost = ksm_thread_pages_to_scan / cpu_percent; per_page_cost = per_page_cost ? per_page_cost : 1; pages = min(pages, per_page_cost * ksm_advisor_max_cpu); pages = max(pages, per_page_cost * KSM_ADVISOR_MIN_CPU); pages = min(pages, ksm_advisor_max_pages_to_scan); /* Update advisor context */ advisor_ctx.change = change; advisor_ctx.scan_time = scan_time; advisor_ctx.cpu_time = cpu_time; ksm_thread_pages_to_scan = pages; trace_ksm_advisor(scan_time, pages, cpu_percent); } static void advisor_stop_scan(void) { if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) scan_time_advisor(); } #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; static int ksm_nr_node_ids = 1; #else #define ksm_merge_across_nodes 1U #define ksm_nr_node_ids 1 #endif #define KSM_RUN_STOP 0 #define KSM_RUN_MERGE 1 #define KSM_RUN_UNMERGE 2 #define KSM_RUN_OFFLINE 4 static unsigned long ksm_run = KSM_RUN_STOP; static void wait_while_offlining(void); static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); static DEFINE_MUTEX(ksm_thread_mutex); static DEFINE_SPINLOCK(ksm_mmlist_lock); static int __init ksm_slab_init(void) { rmap_item_cache = KMEM_CACHE(ksm_rmap_item, 0); if (!rmap_item_cache) goto out; stable_node_cache = KMEM_CACHE(ksm_stable_node, 0); if (!stable_node_cache) goto out_free1; mm_slot_cache = KMEM_CACHE(ksm_mm_slot, 0); if (!mm_slot_cache) goto out_free2; return 0; out_free2: kmem_cache_destroy(stable_node_cache); out_free1: kmem_cache_destroy(rmap_item_cache); out: return -ENOMEM; } static void __init ksm_slab_free(void) { kmem_cache_destroy(mm_slot_cache); kmem_cache_destroy(stable_node_cache); kmem_cache_destroy(rmap_item_cache); mm_slot_cache = NULL; } static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain) { return chain->rmap_hlist_len == STABLE_NODE_CHAIN; } static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup) { return dup->head == STABLE_NODE_DUP_HEAD; } static inline 
void stable_node_chain_add_dup(struct ksm_stable_node *dup, struct ksm_stable_node *chain) { VM_BUG_ON(is_stable_node_dup(dup)); dup->head = STABLE_NODE_DUP_HEAD; VM_BUG_ON(!is_stable_node_chain(chain)); hlist_add_head(&dup->hlist_dup, &chain->hlist); ksm_stable_node_dups++; } static inline void __stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(!is_stable_node_dup(dup)); hlist_del(&dup->hlist_dup); ksm_stable_node_dups--; } static inline void stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(is_stable_node_chain(dup)); if (is_stable_node_dup(dup)) __stable_node_dup_del(dup); else rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid)); #ifdef CONFIG_DEBUG_VM dup->head = NULL; #endif } static inline struct ksm_rmap_item *alloc_rmap_item(void) { struct ksm_rmap_item *rmap_item; rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); if (rmap_item) ksm_rmap_items++; return rmap_item; } static inline void free_rmap_item(struct ksm_rmap_item *rmap_item) { ksm_rmap_items--; rmap_item->mm->ksm_rmap_items--; rmap_item->mm = NULL; /* debug safety */ kmem_cache_free(rmap_item_cache, rmap_item); } static inline struct ksm_stable_node *alloc_stable_node(void) { /* * The allocation can take too long with GFP_KERNEL when memory is under * pressure, which may lead to hung task warnings. Adding __GFP_HIGH * grants access to memory reserves, helping to avoid this problem. */ return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH); } static inline void free_stable_node(struct ksm_stable_node *stable_node) { VM_BUG_ON(stable_node->rmap_hlist_len && !is_stable_node_chain(stable_node)); kmem_cache_free(stable_node_cache, stable_node); } /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, * takes mmap_lock briefly to serialize against them. ksm_exit() does not set * a special flag: they can just back out as soon as mm_users goes to zero. * ksm_test_exit() is used throughout to make this test for exit: in some * places for correctness, in some places just to avoid unnecessary work. */ static inline bool ksm_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } /* * We use break_ksm to break COW on a ksm page by triggering unsharing, * such that the ksm page will get replaced by an exclusive anonymous page. * * We take great care only to touch a ksm page, in a VM_MERGEABLE vma, * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP * mmap of /dev/mem, where we would not want to touch it. * * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. */ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma) { vm_fault_t ret = 0; if (lock_vma) vma_start_write(vma); do { bool ksm_page = false; struct folio_walk fw; struct folio *folio; cond_resched(); folio = folio_walk_start(&fw, vma, addr, FW_MIGRATION | FW_ZEROPAGE); if (folio) { /* Small folio implies FW_LEVEL_PTE. 
*/ if (!folio_test_large(folio) && (folio_test_ksm(folio) || is_ksm_zero_pte(fw.pte))) ksm_page = true; folio_walk_end(&fw, vma); } if (!ksm_page) return 0; ret = handle_mm_fault(vma, addr, FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, NULL); } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); /* * We must loop until we no longer find a KSM page because * handle_mm_fault() may back out if there's any difficulty e.g. if * pte accessed bit gets updated concurrently. * * VM_FAULT_SIGBUS could occur if we race with truncation of the * backing file, which also invalidates anonymous pages: that's * okay, that truncation will have unmapped the KSM page for us. * * VM_FAULT_OOM: at the time of writing (late July 2009), setting * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the * current task has TIF_MEMDIE set, and will be OOM killed on return * to user; and ksmd, having no mm, would never be chosen for that. * * But if the mm is in a limited mem_cgroup, then the fault may fail * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and * even ksmd can fail in this way - though it's usually breaking ksm * just to undo a merge it made a moment before, so unlikely to oom. * * That's a pity: we might therefore have more kernel pages allocated * than we're counting as nodes in the stable tree; but ksm_do_scan * will retry to break_cow on each pass, so should recover the page * in due course. The important thing is to not let VM_MERGEABLE * be cleared while any such pages might remain in the area. */ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; } static bool vma_ksm_compatible(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_HUGETLB | VM_MIXEDMAP| VM_DROPPABLE)) return false; /* just ignore the advice */ if (vma_is_dax(vma)) return false; #ifdef VM_SAO if (vma->vm_flags & VM_SAO) return false; #endif #ifdef VM_SPARC_ADI if (vma->vm_flags & VM_SPARC_ADI) return false; #endif return true; } static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; if (ksm_test_exit(mm)) return NULL; vma = vma_lookup(mm, addr); if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) return NULL; return vma; } static void break_cow(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; struct vm_area_struct *vma; /* * It is not an accident that whenever we want to break COW * to undo, we also need to drop a reference to the anon_vma. */ put_anon_vma(rmap_item->anon_vma); mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (vma) break_ksm(vma, addr, false); mmap_read_unlock(mm); } static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; struct vm_area_struct *vma; struct page *page = NULL; struct folio_walk fw; struct folio *folio; mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (!vma) goto out; folio = folio_walk_start(&fw, vma, addr, 0); if (folio) { if (!folio_is_zone_device(folio) && folio_test_anon(folio)) { folio_get(folio); page = fw.page; } folio_walk_end(&fw, vma); } out: if (page) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } mmap_read_unlock(mm); return page; } /* * This helper is used for getting right index into array of tree roots. 
* When merge_across_nodes knob is set to 1, there are only two rb-trees for * stable and unstable pages from all nodes with roots in index 0. Otherwise, * every node has its own stable and unstable tree. */ static inline int get_kpfn_nid(unsigned long kpfn) { return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); } static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup, struct rb_root *root) { struct ksm_stable_node *chain = alloc_stable_node(); VM_BUG_ON(is_stable_node_chain(dup)); if (likely(chain)) { INIT_HLIST_HEAD(&chain->hlist); chain->chain_prune_time = jiffies; chain->rmap_hlist_len = STABLE_NODE_CHAIN; #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) chain->nid = NUMA_NO_NODE; /* debug */ #endif ksm_stable_node_chains++; /* * Put the stable node chain in the first dimension of * the stable tree and at the same time remove the old * stable node. */ rb_replace_node(&dup->node, &chain->node, root); /* * Move the old stable node to the second dimension * queued in the hlist_dup. The invariant is that all * dup stable_nodes in the chain->hlist point to pages * that are write protected and have the exact same * content. */ stable_node_chain_add_dup(dup, chain); } return chain; } static inline void free_stable_node_chain(struct ksm_stable_node *chain, struct rb_root *root) { rb_erase(&chain->node, root); free_stable_node(chain); ksm_stable_node_chains--; } static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node) { struct ksm_rmap_item *rmap_item; /* check it's not STABLE_NODE_CHAIN or negative */ BUG_ON(stable_node->rmap_hlist_len < 0); hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { if (rmap_item->hlist.next) { ksm_pages_sharing--; trace_ksm_remove_rmap_item(stable_node->kpfn, rmap_item, rmap_item->mm); } else { ksm_pages_shared--; } rmap_item->mm->ksm_merging_pages--; VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; cond_resched(); } /* * We need the second aligned pointer of the migrate_nodes * list_head to stay clear from the rb_parent_color union * (aligned and different than any node) and also different * from &migrate_nodes. This will verify that future list.h changes * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it. */ BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes); BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1); trace_ksm_remove_ksm_page(stable_node->kpfn); if (stable_node->head == &migrate_nodes) list_del(&stable_node->list); else stable_node_dup_del(stable_node); free_stable_node(stable_node); } enum ksm_get_folio_flags { KSM_GET_FOLIO_NOLOCK, KSM_GET_FOLIO_LOCK, KSM_GET_FOLIO_TRYLOCK }; /* * ksm_get_folio: checks if the page indicated by the stable node * is still its ksm page, despite having held no reference to it. * In which case we can trust the content of the page, and it * returns the gotten page; but if the page has now been zapped, * remove the stale node from the stable tree and return NULL. * But beware, the stable node's page might be being migrated. * * You would expect the stable_node to hold a reference to the ksm page. * But if it increments the page's count, swapping out has to wait for * ksmd to come around again before it can free the page, which may take * seconds or even minutes: much too unresponsive. 
So instead we use a * "keyhole reference": access to the ksm page from the stable node peeps * out through its keyhole to see if that page still holds the right key, * pointing back to this stable node. This relies on freeing a PageAnon * page to reset its page->mapping to NULL, and relies on no other use of * a page to put something that might look like our key in page->mapping. * is on its way to being freed; but it is an anomaly to bear in mind. */ static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node, enum ksm_get_folio_flags flags) { struct folio *folio; void *expected_mapping; unsigned long kpfn; expected_mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); again: kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */ folio = pfn_folio(kpfn); if (READ_ONCE(folio->mapping) != expected_mapping) goto stale; /* * We cannot do anything with the page while its refcount is 0. * Usually 0 means free, or tail of a higher-order page: in which * case this node is no longer referenced, and should be freed; * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; * the same is in reuse_ksm_page() case; but if page is swapcache * in folio_migrate_mapping(), it might still be our page, * in which case it's essential to keep the node. */ while (!folio_try_get(folio)) { /* * Another check for folio->mapping != expected_mapping * would work here too. We have chosen to test the * swapcache flag to optimize the common case, when the * folio is or is about to be freed: the swapcache flag * is cleared (under spin_lock_irq) in the ref_freeze * section of __remove_mapping(); but anon folio->mapping * is reset to NULL later, in free_pages_prepare(). */ if (!folio_test_swapcache(folio)) goto stale; cpu_relax(); } if (READ_ONCE(folio->mapping) != expected_mapping) { folio_put(folio); goto stale; } if (flags == KSM_GET_FOLIO_TRYLOCK) { if (!folio_trylock(folio)) { folio_put(folio); return ERR_PTR(-EBUSY); } } else if (flags == KSM_GET_FOLIO_LOCK) folio_lock(folio); if (flags != KSM_GET_FOLIO_NOLOCK) { if (READ_ONCE(folio->mapping) != expected_mapping) { folio_unlock(folio); folio_put(folio); goto stale; } } return folio; stale: /* * We come here from above when folio->mapping or the swapcache flag * suggests that the node is stale; but it might be under migration. * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(), * before checking whether node->kpfn has been changed. */ smp_rmb(); if (READ_ONCE(stable_node->kpfn) != kpfn) goto again; remove_node_from_stable_tree(stable_node); return NULL; } /* * Removing rmap_item from stable or unstable tree. * This function will clean the information from the stable/unstable tree. 
*/ static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item) { if (rmap_item->address & STABLE_FLAG) { struct ksm_stable_node *stable_node; struct folio *folio; stable_node = rmap_item->head; folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); if (!folio) goto out; hlist_del(&rmap_item->hlist); folio_unlock(folio); folio_put(folio); if (!hlist_empty(&stable_node->hlist)) ksm_pages_sharing--; else ksm_pages_shared--; rmap_item->mm->ksm_merging_pages--; VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->head = NULL; rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { unsigned char age; /* * Usually ksmd can and must skip the rb_erase, because * root_unstable_tree was already reset to RB_ROOT. * But be careful when an mm is exiting: do the rb_erase * if this rmap_item was inserted by this scan, rather * than left over from before. */ age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); BUG_ON(age > 1); if (!age) rb_erase(&rmap_item->node, root_unstable_tree + NUMA(rmap_item->nid)); ksm_pages_unshared--; rmap_item->address &= PAGE_MASK; } out: cond_resched(); /* we're called from many long loops */ } static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) { while (*rmap_list) { struct ksm_rmap_item *rmap_item = *rmap_list; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); } } /* * Though it's very tempting to unmerge rmap_items from stable tree rather * than check every pte of a given vma, the locking doesn't quite work for * that - an rmap_item is assigned to the stable tree after inserting ksm * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing * rmap_items from parent to child at fork time (so as not to waste time * if exit comes before the next scan reaches it). * * Similarly, although we'd like to remove rmap_items (so updating counts * and freeing memory) when unmerging an area, it's easier to leave that * to the next pass of ksmd - consider, for example, how ksmd might be * in cmp_and_merge_page on one of the rmap_items we would be removing. */ static int unmerge_ksm_pages(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool lock_vma) { unsigned long addr; int err = 0; for (addr = start; addr < end && !err; addr += PAGE_SIZE) { if (ksm_test_exit(vma->vm_mm)) break; if (signal_pending(current)) err = -ERESTARTSYS; else err = break_ksm(vma, addr, lock_vma); } return err; } static inline struct ksm_stable_node *folio_stable_node(const struct folio *folio) { return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL; } static inline struct ksm_stable_node *page_stable_node(struct page *page) { return folio_stable_node(page_folio(page)); } static inline void folio_set_stable_node(struct folio *folio, struct ksm_stable_node *stable_node) { VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio); folio->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); } #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: */ static int remove_stable_node(struct ksm_stable_node *stable_node) { struct folio *folio; int err; folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); if (!folio) { /* * ksm_get_folio did remove_node_from_stable_tree itself. */ return 0; } /* * Page could be still mapped if this races with __mmput() running in * between ksm_exit() and exit_mmap(). 
Just refuse to let * merge_across_nodes/max_page_sharing be switched. */ err = -EBUSY; if (!folio_mapped(folio)) { /* * The stable node did not yet appear stale to ksm_get_folio(), * since that allows for an unmapped ksm folio to be recognized * right up until it is freed; but the node is safe to remove. * This folio might be in an LRU cache waiting to be freed, * or it might be in the swapcache (perhaps under writeback), * or it might have been removed from swapcache a moment ago. */ folio_set_stable_node(folio, NULL); remove_node_from_stable_tree(stable_node); err = 0; } folio_unlock(folio); folio_put(folio); return err; } static int remove_stable_node_chain(struct ksm_stable_node *stable_node, struct rb_root *root) { struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { VM_BUG_ON(is_stable_node_dup(stable_node)); if (remove_stable_node(stable_node)) return true; else return false; } hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { VM_BUG_ON(!is_stable_node_dup(dup)); if (remove_stable_node(dup)) return true; } BUG_ON(!hlist_empty(&stable_node->hlist)); free_stable_node_chain(stable_node, root); return false; } static int remove_all_stable_nodes(void) { struct ksm_stable_node *stable_node, *next; int nid; int err = 0; for (nid = 0; nid < ksm_nr_node_ids; nid++) { while (root_stable_tree[nid].rb_node) { stable_node = rb_entry(root_stable_tree[nid].rb_node, struct ksm_stable_node, node); if (remove_stable_node_chain(stable_node, root_stable_tree + nid)) { err = -EBUSY; break; /* proceed to next nid */ } cond_resched(); } } list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (remove_stable_node(stable_node)) err = -EBUSY; cond_resched(); } return err; } static int unmerge_and_remove_all_rmap_items(void) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int err = 0; spin_lock(&ksm_mmlist_lock); slot = list_entry(ksm_mm_head.slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); spin_unlock(&ksm_mmlist_lock); for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { VMA_ITERATOR(vmi, mm_slot->slot.mm, 0); mm = mm_slot->slot.mm; mmap_read_lock(mm); /* * Exit right away if mm is exiting to avoid lockdep issue in * the maple tree */ if (ksm_test_exit(mm)) goto mm_exiting; for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) continue; err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, false); if (err) goto error; } mm_exiting: remove_trailing_rmap_items(&mm_slot->rmap_list); mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_test_exit(mm)) { hash_del(&mm_slot->slot.hash); list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); clear_bit(MMF_VM_MERGE_ANY, &mm->flags); mmdrop(mm); } else spin_unlock(&ksm_mmlist_lock); } /* Clean up stable nodes, but don't worry if some are still busy */ remove_all_stable_nodes(); ksm_scan.seqnr = 0; return 0; error: mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = &ksm_mm_head; spin_unlock(&ksm_mmlist_lock); return err; } #endif /* CONFIG_SYSFS */ static u32 calc_checksum(struct page *page) { u32 checksum; void *addr = kmap_local_page(page); checksum = 
xxhash(addr, PAGE_SIZE, 0); kunmap_local(addr); return checksum; } static int write_protect_page(struct vm_area_struct *vma, struct folio *folio, pte_t *orig_pte) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, 0, 0); int swapped; int err = -EFAULT; struct mmu_notifier_range range; bool anon_exclusive; pte_t entry; if (WARN_ON_ONCE(folio_test_large(folio))) return err; pvmw.address = page_address_in_vma(folio, folio_page(folio, 0), vma); if (pvmw.address == -EFAULT) goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); if (!page_vma_mapped_walk(&pvmw)) goto out_mn; if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) goto out_unlock; anon_exclusive = PageAnonExclusive(&folio->page); entry = ptep_get(pvmw.pte); if (pte_write(entry) || pte_dirty(entry) || anon_exclusive || mm_tlb_flush_pending(mm)) { swapped = folio_test_swapcache(folio); flush_cache_page(vma, pvmw.address, folio_pfn(folio)); /* * Ok this is tricky, when get_user_pages_fast() run it doesn't * take any lock, therefore the check that we are going to make * with the pagecount against the mapcount is racy and * O_DIRECT can happen right after the check. * So we clear the pte and flush the tlb before the check * this assure us that no O_DIRECT can happen after the check * or in the middle of the check. * * No need to notify as we are downgrading page table to read * only not changing it to point to a new page. * * See Documentation/mm/mmu_notifier.rst */ entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* * Check that no O_DIRECT or similar I/O is in progress on the * page */ if (folio_mapcount(folio) + 1 + swapped != folio_ref_count(folio)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } /* See folio_try_share_anon_rmap_pte(): clear PTE first. */ if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, &folio->page)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } if (pte_dirty(entry)) folio_mark_dirty(folio); entry = pte_mkclean(entry); if (pte_write(entry)) entry = pte_wrprotect(entry); set_pte_at(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = entry; err = 0; out_unlock: page_vma_mapped_walk_done(&pvmw); out_mn: mmu_notifier_invalidate_range_end(&range); out: return err; } /** * replace_page - replace page in vma by new ksm page * @vma: vma that holds the pte pointing to page * @page: the page we are replacing by kpage * @kpage: the ksm page we replace page by * @orig_pte: the original value of the pte * * Returns 0 on success, -EFAULT on failure. */ static int replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage, pte_t orig_pte) { struct folio *kfolio = page_folio(kpage); struct mm_struct *mm = vma->vm_mm; struct folio *folio = page_folio(page); pmd_t *pmd; pmd_t pmde; pte_t *ptep; pte_t newpte; spinlock_t *ptl; unsigned long addr; int err = -EFAULT; struct mmu_notifier_range range; addr = page_address_in_vma(folio, page, vma); if (addr == -EFAULT) goto out; pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; /* * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() * without holding anon_vma lock for write. So when looking for a * genuine pmde (in which to find pte), test present and !THP together. 
*/ pmde = pmdp_get_lockless(pmd); if (!pmd_present(pmde) || pmd_trans_huge(pmde)) goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!ptep) goto out_mn; if (!pte_same(ptep_get(ptep), orig_pte)) { pte_unmap_unlock(ptep, ptl); goto out_mn; } VM_BUG_ON_PAGE(PageAnonExclusive(page), page); VM_BUG_ON_FOLIO(folio_test_anon(kfolio) && PageAnonExclusive(kpage), kfolio); /* * No need to check ksm_use_zero_pages here: we can only have a * zero_page here if ksm_use_zero_pages was enabled already. */ if (!is_zero_pfn(page_to_pfn(kpage))) { folio_get(kfolio); folio_add_anon_rmap_pte(kfolio, kpage, vma, addr, RMAP_NONE); newpte = mk_pte(kpage, vma->vm_page_prot); } else { /* * Use pte_mkdirty to mark the zero page mapped by KSM, and then * we can easily track all KSM-placed zero pages by checking if * the dirty bit in zero page's PTE is set. */ newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot))); ksm_map_zero_page(mm); /* * We're replacing an anonymous page with a zero page, which is * not anonymous. We need to do proper accounting otherwise we * will get wrong values in /proc, and a BUG message in dmesg * when tearing down the mm. */ dec_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep))); /* * No need to notify as we are replacing a read only page with another * read only page with the same content. * * See Documentation/mm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); set_pte_at(mm, addr, ptep, newpte); folio_remove_rmap_pte(folio, page, vma); if (!folio_mapped(folio)) folio_free_swap(folio); folio_put(folio); pte_unmap_unlock(ptep, ptl); err = 0; out_mn: mmu_notifier_invalidate_range_end(&range); out: return err; } /* * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page * @page: the PageAnon page that we want to replace with kpage * @kpage: the KSM page that we want to map instead of page, * or NULL the first time when we want to use page as kpage. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ static int try_to_merge_one_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) { struct folio *folio = page_folio(page); pte_t orig_pte = __pte(0); int err = -EFAULT; if (page == kpage) /* ksm page forked */ return 0; if (!folio_test_anon(folio)) goto out; /* * We need the folio lock to read a stable swapcache flag in * write_protect_page(). We trylock because we don't want to wait * here - we prefer to continue scanning and merging different * pages, then come back to this page when it is unlocked. */ if (!folio_trylock(folio)) goto out; if (folio_test_large(folio)) { if (split_huge_page(page)) goto out_unlock; folio = page_folio(page); } /* * If this anonymous page is mapped only here, its pte may need * to be write-protected. If it's mapped elsewhere, all of its * ptes are necessarily already write-protected. But in either * case, we need to lock and check page_count is not raised. */ if (write_protect_page(vma, folio, &orig_pte) == 0) { if (!kpage) { /* * While we hold folio lock, upgrade folio from * anon to a NULL stable_node with the KSM flag set: * stable_tree_insert() will update stable_node. */ folio_set_stable_node(folio, NULL); folio_mark_accessed(folio); /* * Page reclaim just frees a clean folio with no dirty * ptes: make sure that the ksm page would be swapped. 
*/ if (!folio_test_dirty(folio)) folio_mark_dirty(folio); err = 0; } else if (pages_identical(page, kpage)) err = replace_page(vma, page, kpage, orig_pte); } out_unlock: folio_unlock(folio); out: return err; } /* * This function returns 0 if the pages were merged or if they are * no longer merging candidates (e.g., VMA stale), -EFAULT otherwise. */ static int try_to_merge_with_zero_page(struct ksm_rmap_item *rmap_item, struct page *page) { struct mm_struct *mm = rmap_item->mm; int err = -EFAULT; /* * Same checksum as an empty page. We attempt to merge it with the * appropriate zero page if the user enabled this via sysfs. */ if (ksm_use_zero_pages && (rmap_item->oldchecksum == zero_checksum)) { struct vm_area_struct *vma; mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (vma) { err = try_to_merge_one_page(vma, page, ZERO_PAGE(rmap_item->address)); trace_ksm_merge_one_page( page_to_pfn(ZERO_PAGE(rmap_item->address)), rmap_item, mm, err); } else { /* * If the vma is out of date, we do not need to * continue. */ err = 0; } mmap_read_unlock(mm); } return err; } /* * try_to_merge_with_ksm_page - like try_to_merge_two_pages, * but no new kernel page is allocated: kpage must already be a ksm page. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item, struct page *page, struct page *kpage) { struct mm_struct *mm = rmap_item->mm; struct vm_area_struct *vma; int err = -EFAULT; mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (!vma) goto out; err = try_to_merge_one_page(vma, page, kpage); if (err) goto out; /* Unstable nid is in union with stable anon_vma: remove first */ remove_rmap_item_from_tree(rmap_item); /* Must get reference to anon_vma while still holding mmap_lock */ rmap_item->anon_vma = vma->anon_vma; get_anon_vma(vma->anon_vma); out: mmap_read_unlock(mm); trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page), rmap_item, mm, err); return err; } /* * try_to_merge_two_pages - take two identical pages and prepare them * to be merged into one page. * * This function returns the kpage if we successfully merged two identical * pages into one ksm page, NULL otherwise. * * Note that this function upgrades page to ksm page: if one of the pages * is already a ksm page, try_to_merge_with_ksm_page should be used. */ static struct folio *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item, struct page *page, struct ksm_rmap_item *tree_rmap_item, struct page *tree_page) { int err; err = try_to_merge_with_ksm_page(rmap_item, page, NULL); if (!err) { err = try_to_merge_with_ksm_page(tree_rmap_item, tree_page, page); /* * If that fails, we have a ksm page with only one pte * pointing to it: so break it. */ if (err) break_cow(rmap_item); } return err ? NULL : page_folio(page); } static __always_inline bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset) { VM_BUG_ON(stable_node->rmap_hlist_len < 0); /* * Check that at least one mapping still exists, otherwise * there's no much point to merge and share with this * stable_node, as the underlying tree_page of the other * sharer is going to be freed soon. 
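 *
 * The offset argument accounts for mappings the caller is about to
 * add on top of the one already under way. Worked example, assuming
 * the default ksm_max_page_sharing of 256: a dup whose rmap_hlist_len
 * is 255 still qualifies with offset == 0 (255 + 0 < 256), but not
 * with offset == 1 (255 + 1 < 256 is false).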
*/ return stable_node->rmap_hlist_len && stable_node->rmap_hlist_len + offset < ksm_max_page_sharing; } static __always_inline bool is_page_sharing_candidate(struct ksm_stable_node *stable_node) { return __is_page_sharing_candidate(stable_node, 0); } static struct folio *stable_node_dup(struct ksm_stable_node **_stable_node_dup, struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; struct folio *folio, *tree_folio = NULL; int found_rmap_hlist_len; if (!prune_stale_stable_nodes || time_before(jiffies, stable_node->chain_prune_time + msecs_to_jiffies( ksm_stable_node_chains_prune_millisecs))) prune_stale_stable_nodes = false; else stable_node->chain_prune_time = jiffies; hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { cond_resched(); /* * We must walk all stable_node_dup to prune the stale * stable nodes during lookup. * * ksm_get_folio can drop the nodes from the * stable_node->hlist if they point to freed pages * (that's why we do a _safe walk). The "dup" * stable_node parameter itself will be freed from * under us if it returns NULL. */ folio = ksm_get_folio(dup, KSM_GET_FOLIO_NOLOCK); if (!folio) continue; /* Pick the best candidate if possible. */ if (!found || (is_page_sharing_candidate(dup) && (!is_page_sharing_candidate(found) || dup->rmap_hlist_len > found_rmap_hlist_len))) { if (found) folio_put(tree_folio); found = dup; found_rmap_hlist_len = found->rmap_hlist_len; tree_folio = folio; /* skip put_page for found candidate */ if (!prune_stale_stable_nodes && is_page_sharing_candidate(found)) break; continue; } folio_put(folio); } if (found) { if (hlist_is_singular_node(&found->hlist_dup, &stable_node->hlist)) { /* * If there's not just one entry it would * corrupt memory, better BUG_ON. In KSM * context with no lock held it's not even * fatal. */ BUG_ON(stable_node->hlist.first->next); /* * There's just one entry and it is below the * deduplication limit so drop the chain. */ rb_replace_node(&stable_node->node, &found->node, root); free_stable_node(stable_node); ksm_stable_node_chains--; ksm_stable_node_dups--; /* * NOTE: the caller depends on the stable_node * to be equal to stable_node_dup if the chain * was collapsed. */ *_stable_node = found; /* * Just for robustness, as stable_node is * otherwise left as a stable pointer, the * compiler shall optimize it away at build * time. */ stable_node = NULL; } else if (stable_node->hlist.first != &found->hlist_dup && __is_page_sharing_candidate(found, 1)) { /* * If the found stable_node dup can accept one * more future merge (in addition to the one * that is underway) and is not at the head of * the chain, put it there so next search will * be quicker in the !prune_stale_stable_nodes * case. * * NOTE: it would be inaccurate to use nr > 1 * instead of checking the hlist.first pointer * directly, because in the * prune_stale_stable_nodes case "nr" isn't * the position of the found dup in the chain, * but the total number of dups in the chain. */ hlist_del(&found->hlist_dup); hlist_add_head(&found->hlist_dup, &stable_node->hlist); } } else { /* Its hlist must be empty if no one found. */ free_stable_node_chain(stable_node, root); } *_stable_node_dup = found; return tree_folio; } /* * Like for ksm_get_folio, this function can free the *_stable_node and * *_stable_node_dup if the returned tree_page is NULL. 
* * It can also free and overwrite *_stable_node with the found * stable_node_dup if the chain is collapsed (in which case * *_stable_node will be equal to *_stable_node_dup like if the chain * never existed). It's up to the caller to verify tree_page is not * NULL before dereferencing *_stable_node or *_stable_node_dup. * * *_stable_node_dup is really a second output parameter of this * function and will be overwritten in all cases, the caller doesn't * need to initialize it. */ static struct folio *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { struct ksm_stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { *_stable_node_dup = stable_node; return ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK); } return stable_node_dup(_stable_node_dup, _stable_node, root, prune_stale_stable_nodes); } static __always_inline struct folio *chain_prune(struct ksm_stable_node **s_n_d, struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, true); } static __always_inline struct folio *chain(struct ksm_stable_node **s_n_d, struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, false); } /* * stable_tree_search - search for page inside the stable tree * * This function checks if there is a page inside the stable tree * with identical content to the page that we are scanning right now. * * This function returns the stable tree node of identical content if found, * -EBUSY if the stable node's page is being migrated, NULL otherwise. */ static struct folio *stable_tree_search(struct page *page) { int nid; struct rb_root *root; struct rb_node **new; struct rb_node *parent; struct ksm_stable_node *stable_node, *stable_node_dup; struct ksm_stable_node *page_node; struct folio *folio; folio = page_folio(page); page_node = folio_stable_node(folio); if (page_node && page_node->head != &migrate_nodes) { /* ksm page forked */ folio_get(folio); return folio; } nid = get_kpfn_nid(folio_pfn(folio)); root = root_stable_tree + nid; again: new = &root->rb_node; parent = NULL; while (*new) { struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); tree_folio = chain_prune(&stable_node_dup, &stable_node, root); if (!tree_folio) { /* * If we walked over a stale stable_node, * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate * false negative insertions just because some * stable_node was stale. */ goto again; } ret = memcmp_pages(page, &tree_folio->page); folio_put(tree_folio); parent = *new; if (ret < 0) new = &parent->rb_left; else if (ret > 0) new = &parent->rb_right; else { if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); /* * If the mapcount of our migrated KSM folio is * at most 1, we can merge it with another * KSM folio where we know that we have space * for one more mapping without exceeding the * ksm_max_page_sharing limit: see * chain_prune(). This way, we can avoid adding * this stable node to the chain. 
*/ if (folio_mapcount(folio) > 1) goto chain_append; } if (!is_page_sharing_candidate(stable_node_dup)) { /* * If the stable_node is a chain and * we got a payload match in memcmp * but we cannot merge the scanned * page in any of the existing * stable_node dups because they're * all full, we need to wait the * scanned page to find itself a match * in the unstable tree to create a * brand new KSM page to add later to * the dups of this stable_node. */ return NULL; } /* * Lock and unlock the stable_node's page (which * might already have been migrated) so that page * migration is sure to notice its raised count. * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ tree_folio = ksm_get_folio(stable_node_dup, KSM_GET_FOLIO_TRYLOCK); if (PTR_ERR(tree_folio) == -EBUSY) return ERR_PTR(-EBUSY); if (unlikely(!tree_folio)) /* * The tree may have been rebalanced, * so re-evaluate parent and new. */ goto again; folio_unlock(tree_folio); if (get_kpfn_nid(stable_node_dup->kpfn) != NUMA(stable_node_dup->nid)) { folio_put(tree_folio); goto replace; } return tree_folio; } } if (!page_node) return NULL; list_del(&page_node->list); DO_NUMA(page_node->nid = nid); rb_link_node(&page_node->node, parent, new); rb_insert_color(&page_node->node, root); out: if (is_page_sharing_candidate(page_node)) { folio_get(folio); return folio; } else return NULL; replace: /* * If stable_node was a chain and chain_prune collapsed it, * stable_node has been updated to be the new regular * stable_node. A collapse of the chain is indistinguishable * from the case there was no chain in the stable * rbtree. Otherwise stable_node is the chain and * stable_node_dup is the dup to replace. */ if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_chain(stable_node_dup)); VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* there is no chain */ if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); rb_replace_node(&stable_node_dup->node, &page_node->node, root); if (is_page_sharing_candidate(page_node)) folio_get(folio); else folio = NULL; } else { rb_erase(&stable_node_dup->node, root); folio = NULL; } } else { VM_BUG_ON(!is_stable_node_chain(stable_node)); __stable_node_dup_del(stable_node_dup); if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); stable_node_chain_add_dup(page_node, stable_node); if (is_page_sharing_candidate(page_node)) folio_get(folio); else folio = NULL; } else { folio = NULL; } } stable_node_dup->head = &migrate_nodes; list_add(&stable_node_dup->list, stable_node_dup->head); return folio; chain_append: /* * If stable_node was a chain and chain_prune collapsed it, * stable_node has been updated to be the new regular * stable_node. A collapse of the chain is indistinguishable * from the case there was no chain in the stable * rbtree. Otherwise stable_node is the chain and * stable_node_dup is the dup to replace. */ if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* chain is missing so create it */ stable_node = alloc_stable_node_chain(stable_node_dup, root); if (!stable_node) return NULL; } /* * Add this stable_node dup that was * migrated to the stable_node chain * of the current nid for this page * content. 
*/ VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); stable_node_chain_add_dup(page_node, stable_node); goto out; } /* * stable_tree_insert - insert stable tree node pointing to new ksm page * into the stable tree. * * This function returns the stable tree node just allocated on success, * NULL otherwise. */ static struct ksm_stable_node *stable_tree_insert(struct folio *kfolio) { int nid; unsigned long kpfn; struct rb_root *root; struct rb_node **new; struct rb_node *parent; struct ksm_stable_node *stable_node, *stable_node_dup; bool need_chain = false; kpfn = folio_pfn(kfolio); nid = get_kpfn_nid(kpfn); root = root_stable_tree + nid; again: parent = NULL; new = &root->rb_node; while (*new) { struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); tree_folio = chain(&stable_node_dup, &stable_node, root); if (!tree_folio) { /* * If we walked over a stale stable_node, * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate * false negative insertions just because some * stable_node was stale. */ goto again; } ret = memcmp_pages(&kfolio->page, &tree_folio->page); folio_put(tree_folio); parent = *new; if (ret < 0) new = &parent->rb_left; else if (ret > 0) new = &parent->rb_right; else { need_chain = true; break; } } stable_node_dup = alloc_stable_node(); if (!stable_node_dup) return NULL; INIT_HLIST_HEAD(&stable_node_dup->hlist); stable_node_dup->kpfn = kpfn; stable_node_dup->rmap_hlist_len = 0; DO_NUMA(stable_node_dup->nid = nid); if (!need_chain) { rb_link_node(&stable_node_dup->node, parent, new); rb_insert_color(&stable_node_dup->node, root); } else { if (!is_stable_node_chain(stable_node)) { struct ksm_stable_node *orig = stable_node; /* chain is missing so create it */ stable_node = alloc_stable_node_chain(orig, root); if (!stable_node) { free_stable_node(stable_node_dup); return NULL; } } stable_node_chain_add_dup(stable_node_dup, stable_node); } folio_set_stable_node(kfolio, stable_node_dup); return stable_node_dup; } /* * unstable_tree_search_insert - search for identical page, * else insert rmap_item into the unstable tree. * * This function searches for a page in the unstable tree identical to the * page currently being scanned; and if no identical page is found in the * tree, we insert rmap_item as a new object into the unstable tree. * * This function returns pointer to rmap_item found to be identical * to the currently scanned page, NULL otherwise. * * This function does both searching and inserting, because they share * the same walking algorithm in an rbtree. */ static struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item, struct page *page, struct page **tree_pagep) { struct rb_node **new; struct rb_root *root; struct rb_node *parent = NULL; int nid; nid = get_kpfn_nid(page_to_pfn(page)); root = root_unstable_tree + nid; new = &root->rb_node; while (*new) { struct ksm_rmap_item *tree_rmap_item; struct page *tree_page; int ret; cond_resched(); tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node); tree_page = get_mergeable_page(tree_rmap_item); if (!tree_page) return NULL; /* * Don't substitute a ksm page for a forked page. 
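 *
 * (page == tree_page means both rmap_items reference the same
 * physical page, typically because one mm forked from the other;
 * there is nothing to merge in that case.)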
*/ if (page == tree_page) { put_page(tree_page); return NULL; } ret = memcmp_pages(page, tree_page); parent = *new; if (ret < 0) { put_page(tree_page); new = &parent->rb_left; } else if (ret > 0) { put_page(tree_page); new = &parent->rb_right; } else if (!ksm_merge_across_nodes && page_to_nid(tree_page) != nid) { /* * If tree_page has been migrated to another NUMA node, * it will be flushed out and put in the right unstable * tree next time: only merge with it when across_nodes. */ put_page(tree_page); return NULL; } else { *tree_pagep = tree_page; return tree_rmap_item; } } rmap_item->address |= UNSTABLE_FLAG; rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); DO_NUMA(rmap_item->nid = nid); rb_link_node(&rmap_item->node, parent, new); rb_insert_color(&rmap_item->node, root); ksm_pages_unshared++; return NULL; } /* * stable_tree_append - add another rmap_item to the linked list of * rmap_items hanging off a given node of the stable tree, all sharing * the same ksm page. */ static void stable_tree_append(struct ksm_rmap_item *rmap_item, struct ksm_stable_node *stable_node, bool max_page_sharing_bypass) { /* * rmap won't find this mapping if we don't insert the * rmap_item in the right stable_node * duplicate. page_migration could break later if rmap breaks, * so we can as well crash here. We really need to check for * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check * for other negative values as an underflow if detected here * for the first time (and not when decreasing rmap_hlist_len) * would be sign of memory corruption in the stable_node. */ BUG_ON(stable_node->rmap_hlist_len < 0); stable_node->rmap_hlist_len++; if (!max_page_sharing_bypass) /* possibly non fatal but unexpected overflow, only warn */ WARN_ON_ONCE(stable_node->rmap_hlist_len > ksm_max_page_sharing); rmap_item->head = stable_node; rmap_item->address |= STABLE_FLAG; hlist_add_head(&rmap_item->hlist, &stable_node->hlist); if (rmap_item->hlist.next) ksm_pages_sharing++; else ksm_pages_shared++; rmap_item->mm->ksm_merging_pages++; } /* * cmp_and_merge_page - first see if page can be merged into the stable tree; * if not, compare checksum to previous and if it's the same, see if page can * be inserted into the unstable tree, or merged with a page already there and * both transferred to the stable tree. * * @page: the page that we are searching identical page to. * @rmap_item: the reverse mapping into the virtual address of this page */ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct ksm_stable_node *stable_node; struct folio *kfolio; unsigned int checksum; int err; bool max_page_sharing_bypass = false; stable_node = page_stable_node(page); if (stable_node) { if (stable_node->head != &migrate_nodes && get_kpfn_nid(READ_ONCE(stable_node->kpfn)) != NUMA(stable_node->nid)) { stable_node_dup_del(stable_node); stable_node->head = &migrate_nodes; list_add(&stable_node->list, stable_node->head); } if (stable_node->head != &migrate_nodes && rmap_item->head == stable_node) return; /* * If it's a KSM fork, allow it to go over the sharing limit * without warnings. 
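 *
 * A forked mm inherits ptes that already point at this KSM page, so
 * rmap_hlist_len may legitimately end up above ksm_max_page_sharing
 * here; max_page_sharing_bypass only suppresses the WARN_ON_ONCE in
 * stable_tree_append(), the length accounting itself still happens.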
*/ if (!is_page_sharing_candidate(stable_node)) max_page_sharing_bypass = true; } else { remove_rmap_item_from_tree(rmap_item); /* * If the hash value of the page has changed from the last time * we calculated it, this page is changing frequently: therefore we * don't want to insert it in the unstable tree, and we don't want * to waste our time searching for something identical to it there. */ checksum = calc_checksum(page); if (rmap_item->oldchecksum != checksum) { rmap_item->oldchecksum = checksum; return; } if (!try_to_merge_with_zero_page(rmap_item, page)) return; } /* Start by searching for the folio in the stable tree */ kfolio = stable_tree_search(page); if (&kfolio->page == page && rmap_item->head == stable_node) { folio_put(kfolio); return; } remove_rmap_item_from_tree(rmap_item); if (kfolio) { if (kfolio == ERR_PTR(-EBUSY)) return; err = try_to_merge_with_ksm_page(rmap_item, page, &kfolio->page); if (!err) { /* * The page was successfully merged: * add its rmap_item to the stable tree. */ folio_lock(kfolio); stable_tree_append(rmap_item, folio_stable_node(kfolio), max_page_sharing_bypass); folio_unlock(kfolio); } folio_put(kfolio); return; } tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { bool split; kfolio = try_to_merge_two_pages(rmap_item, page, tree_rmap_item, tree_page); /* * If both pages we tried to merge belong to the same compound * page, then we actually ended up increasing the reference * count of the same compound page twice, and split_huge_page * failed. * Here we set a flag if that happened, and we use it later to * try split_huge_page again. Since we call put_page right * afterwards, the reference count will be correct and * split_huge_page should succeed. */ split = PageTransCompound(page) && compound_head(page) == compound_head(tree_page); put_page(tree_page); if (kfolio) { /* * The pages were successfully merged: insert new * node in the stable tree and add both rmap_items. */ folio_lock(kfolio); stable_node = stable_tree_insert(kfolio); if (stable_node) { stable_tree_append(tree_rmap_item, stable_node, false); stable_tree_append(rmap_item, stable_node, false); } folio_unlock(kfolio); /* * If we fail to insert the page into the stable tree, * we will have 2 virtual addresses that are pointing * to a ksm page left outside the stable tree, * in which case we need to break_cow on both. */ if (!stable_node) { break_cow(tree_rmap_item); break_cow(rmap_item); } } else if (split) { /* * We are here if we tried to merge two pages and * failed because they both belonged to the same * compound page. We will split the page now, but no * merging will take place. * We do not want to add the cost of a full lock; if * the page is locked, it is better to skip it and * perhaps try again later. 
*/ if (!trylock_page(page)) return; split_huge_page(page); unlock_page(page); } } } static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, struct ksm_rmap_item **rmap_list, unsigned long addr) { struct ksm_rmap_item *rmap_item; while (*rmap_list) { rmap_item = *rmap_list; if ((rmap_item->address & PAGE_MASK) == addr) return rmap_item; if (rmap_item->address > addr) break; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); } rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ rmap_item->mm = mm_slot->slot.mm; rmap_item->mm->ksm_rmap_items++; rmap_item->address = addr; rmap_item->rmap_list = *rmap_list; *rmap_list = rmap_item; } return rmap_item; } /* * Calculate skip age for the ksm page age. The age determines how often * de-duplicating has already been tried unsuccessfully. If the age is * smaller, the scanning of this page is skipped for less scans. * * @age: rmap_item age of page */ static unsigned int skip_age(rmap_age_t age) { if (age <= 3) return 1; if (age <= 5) return 2; if (age <= 8) return 4; return 8; } /* * Determines if a page should be skipped for the current scan. * * @folio: folio containing the page to check * @rmap_item: associated rmap_item of page */ static bool should_skip_rmap_item(struct folio *folio, struct ksm_rmap_item *rmap_item) { rmap_age_t age; if (!ksm_smart_scan) return false; /* * Never skip pages that are already KSM; pages cmp_and_merge_page() * will essentially ignore them, but we still have to process them * properly. */ if (folio_test_ksm(folio)) return false; age = rmap_item->age; if (age != U8_MAX) rmap_item->age++; /* * Smaller ages are not skipped, they need to get a chance to go * through the different phases of the KSM merging. */ if (age < 3) return false; /* * Are we still allowed to skip? If not, then don't skip it * and determine how much more often we are allowed to skip next. */ if (!rmap_item->remaining_skips) { rmap_item->remaining_skips = skip_age(age); return false; } /* Skip this page */ ksm_pages_skipped++; rmap_item->remaining_skips--; remove_rmap_item_from_tree(rmap_item); return true; } static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; struct ksm_mm_slot *mm_slot; struct mm_slot *slot; struct vm_area_struct *vma; struct ksm_rmap_item *rmap_item; struct vma_iterator vmi; int nid; if (list_empty(&ksm_mm_head.slot.mm_node)) return NULL; mm_slot = ksm_scan.mm_slot; if (mm_slot == &ksm_mm_head) { advisor_start_scan(); trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items); /* * A number of pages can hang around indefinitely in per-cpu * LRU cache, raised page count preventing write_protect_page * from merging them. Though it doesn't really matter much, * it is puzzling to see some stuck in pages_volatile until * other activity jostles them out, and they also prevented * LTP's KSM test from succeeding deterministically; so drain * them here (here rather than on entry to ksm_do_scan(), * so we don't IPI too often when pages_to_scan is set low). */ lru_add_drain_all(); /* * Whereas stale stable_nodes on the stable_tree itself * get pruned in the regular course of stable_tree_search(), * those moved out to the migrate_nodes list can accumulate: * so prune them once before each full scan. 
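 *
 * ksm_get_folio() removes a stale stable_node (one whose page is
 * already gone) as a side effect, which is all the pruning needed
 * here; any folio reference it returns is dropped again immediately.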
*/ if (!ksm_merge_across_nodes) { struct ksm_stable_node *stable_node, *next; struct folio *folio; list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK); if (folio) folio_put(folio); cond_resched(); } } for (nid = 0; nid < ksm_nr_node_ids; nid++) root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); ksm_scan.mm_slot = mm_slot; spin_unlock(&ksm_mmlist_lock); /* * Although we tested list_empty() above, a racing __ksm_exit * of the last mm on the list may have removed it since then. */ if (mm_slot == &ksm_mm_head) return NULL; next_mm: ksm_scan.address = 0; ksm_scan.rmap_list = &mm_slot->rmap_list; } slot = &mm_slot->slot; mm = slot->mm; vma_iter_init(&vmi, mm, ksm_scan.address); mmap_read_lock(mm); if (ksm_test_exit(mm)) goto no_vmas; for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE)) continue; if (ksm_scan.address < vma->vm_start) ksm_scan.address = vma->vm_start; if (!vma->anon_vma) ksm_scan.address = vma->vm_end; while (ksm_scan.address < vma->vm_end) { struct page *tmp_page = NULL; struct folio_walk fw; struct folio *folio; if (ksm_test_exit(mm)) break; folio = folio_walk_start(&fw, vma, ksm_scan.address, 0); if (folio) { if (!folio_is_zone_device(folio) && folio_test_anon(folio)) { folio_get(folio); tmp_page = fw.page; } folio_walk_end(&fw, vma); } if (tmp_page) { flush_anon_page(vma, tmp_page, ksm_scan.address); flush_dcache_page(tmp_page); rmap_item = get_next_rmap_item(mm_slot, ksm_scan.rmap_list, ksm_scan.address); if (rmap_item) { ksm_scan.rmap_list = &rmap_item->rmap_list; if (should_skip_rmap_item(folio, rmap_item)) { folio_put(folio); goto next_page; } ksm_scan.address += PAGE_SIZE; *page = tmp_page; } else { folio_put(folio); } mmap_read_unlock(mm); return rmap_item; } next_page: ksm_scan.address += PAGE_SIZE; cond_resched(); } } if (ksm_test_exit(mm)) { no_vmas: ksm_scan.address = 0; ksm_scan.rmap_list = &mm_slot->rmap_list; } /* * Nuke all the rmap_items that are above this current rmap: * because there were no VM_MERGEABLE vmas with such addresses. */ remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_lock * throughout, and found no VM_MERGEABLE: so do the same as * __ksm_exit does to remove this mm from all our lists now. * This applies either when cleaning up after __ksm_exit * (but beware: we can reach here even before __ksm_exit), * or when all VM_MERGEABLE areas have been unmapped (and * mmap_lock then protects against race with MADV_MERGEABLE). */ hash_del(&mm_slot->slot.hash); list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); clear_bit(MMF_VM_MERGE_ANY, &mm->flags); mmap_read_unlock(mm); mmdrop(mm); } else { mmap_read_unlock(mm); /* * mmap_read_unlock(mm) first because after * spin_unlock(&ksm_mmlist_lock) run, the "mm" may * already have been freed under us by __ksm_exit() * because the "mm_slot" is still hashed and * ksm_scan.mm_slot doesn't point to it anymore. 
*/ spin_unlock(&ksm_mmlist_lock); } /* Repeat until we've completed scanning the whole list */ mm_slot = ksm_scan.mm_slot; if (mm_slot != &ksm_mm_head) goto next_mm; advisor_stop_scan(); trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items); ksm_scan.seqnr++; return NULL; } /** * ksm_do_scan - the ksm scanner main worker function. * @scan_npages: number of pages we want to scan before we return. */ static void ksm_do_scan(unsigned int scan_npages) { struct ksm_rmap_item *rmap_item; struct page *page; while (scan_npages-- && likely(!freezing(current))) { cond_resched(); rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) return; cmp_and_merge_page(page, rmap_item); put_page(page); ksm_pages_scanned++; } } static int ksmd_should_run(void) { return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node); } static int ksm_scan_thread(void *nothing) { unsigned int sleep_ms; set_freezable(); set_user_nice(current, 5); while (!kthread_should_stop()) { mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksmd_should_run()) ksm_do_scan(ksm_thread_pages_to_scan); mutex_unlock(&ksm_thread_mutex); if (ksmd_should_run()) { sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); wait_event_freezable_timeout(ksm_iter_wait, sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), msecs_to_jiffies(sleep_ms)); } else { wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); } } return 0; } static void __ksm_add_vma(struct vm_area_struct *vma) { unsigned long vm_flags = vma->vm_flags; if (vm_flags & VM_MERGEABLE) return; if (vma_ksm_compatible(vma)) vm_flags_set(vma, VM_MERGEABLE); } static int __ksm_del_vma(struct vm_area_struct *vma) { int err; if (!(vma->vm_flags & VM_MERGEABLE)) return 0; if (vma->anon_vma) { err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true); if (err) return err; } vm_flags_clear(vma, VM_MERGEABLE); return 0; } /** * ksm_add_vma - Mark vma as mergeable if compatible * * @vma: Pointer to vma */ void ksm_add_vma(struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) __ksm_add_vma(vma); } static void ksm_add_vmas(struct mm_struct *mm) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) __ksm_add_vma(vma); } static int ksm_del_vmas(struct mm_struct *mm) { struct vm_area_struct *vma; int err; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { err = __ksm_del_vma(vma); if (err) return err; } return 0; } /** * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all * compatible VMA's * * @mm: Pointer to mm * * Returns 0 on success, otherwise error code */ int ksm_enable_merge_any(struct mm_struct *mm) { int err; if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return 0; if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); if (err) return err; } set_bit(MMF_VM_MERGE_ANY, &mm->flags); ksm_add_vmas(mm); return 0; } /** * ksm_disable_merge_any - Disable merging on all compatible VMA's of the mm, * previously enabled via ksm_enable_merge_any(). * * Disabling merging implies unmerging any merged pages, like setting * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and * merging on all compatible VMA's remains enabled. 
* * @mm: Pointer to mm * * Returns 0 on success, otherwise error code */ int ksm_disable_merge_any(struct mm_struct *mm) { int err; if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return 0; err = ksm_del_vmas(mm); if (err) { ksm_add_vmas(mm); return err; } clear_bit(MMF_VM_MERGE_ANY, &mm->flags); return 0; } int ksm_disable(struct mm_struct *mm) { mmap_assert_write_locked(mm); if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) return 0; if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return ksm_disable_merge_any(mm); return ksm_del_vmas(mm); } int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; int err; switch (advice) { case MADV_MERGEABLE: if (vma->vm_flags & VM_MERGEABLE) return 0; if (!vma_ksm_compatible(vma)) return 0; if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); if (err) return err; } *vm_flags |= VM_MERGEABLE; break; case MADV_UNMERGEABLE: if (!(*vm_flags & VM_MERGEABLE)) return 0; /* just ignore the advice */ if (vma->anon_vma) { err = unmerge_ksm_pages(vma, start, end, true); if (err) return err; } *vm_flags &= ~VM_MERGEABLE; break; } return 0; } EXPORT_SYMBOL_GPL(ksm_madvise); int __ksm_enter(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; int needs_wakeup; mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return -ENOMEM; slot = &mm_slot->slot; /* Check ksm_run too? Would need tighter locking */ needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node); spin_lock(&ksm_mmlist_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* * When KSM_RUN_MERGE (or KSM_RUN_STOP), * insert just behind the scanning cursor, to let the area settle * down a little; when fork is followed by immediate exec, we don't * want ksmd to waste time setting up and tearing down an rmap_list. * * But when KSM_RUN_UNMERGE, it's important to insert ahead of its * scanning cursor, otherwise KSM pages in newly forked mms will be * missed: then we might as well insert at the end of the list. */ if (ksm_run & KSM_RUN_UNMERGE) list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node); else list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); mmgrab(mm); if (needs_wakeup) wake_up_interruptible(&ksm_thread_wait); trace_ksm_enter(mm); return 0; } void __ksm_exit(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; int easy_to_free = 0; /* * This process is exiting: if it's straightforward (as is the * case when ksmd was never running), free mm_slot immediately. * But if it's at the cursor or has rmap_items linked to it, use * mmap_lock to synchronize with any break_cows before pagetables * are freed, and leave the mm_slot on the list for ksmd to free. * Beware: ksm may already have noticed it exiting and freed the slot. 
*/ spin_lock(&ksm_mmlist_lock); slot = mm_slot_lookup(mm_slots_hash, mm); mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (mm_slot && ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { hash_del(&slot->hash); list_del(&slot->mm_node); easy_to_free = 1; } else { list_move(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); } } spin_unlock(&ksm_mmlist_lock); if (easy_to_free) { mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGE_ANY, &mm->flags); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmdrop(mm); } else if (mm_slot) { mmap_write_lock(mm); mmap_write_unlock(mm); } trace_ksm_exit(mm); } struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { struct page *page = folio_page(folio, 0); struct anon_vma *anon_vma = folio_anon_vma(folio); struct folio *new_folio; if (folio_test_large(folio)) return folio; if (folio_test_ksm(folio)) { if (folio_stable_node(folio) && !(ksm_run & KSM_RUN_UNMERGE)) return folio; /* no need to copy it */ } else if (!anon_vma) { return folio; /* no need to copy it */ } else if (folio->index == linear_page_index(vma, addr) && anon_vma->root == vma->anon_vma->root) { return folio; /* still no need to copy it */ } if (PageHWPoison(page)) return ERR_PTR(-EHWPOISON); if (!folio_test_uptodate(folio)) return folio; /* let do_swap_page report the error */ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (new_folio && mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) { folio_put(new_folio); new_folio = NULL; } if (new_folio) { if (copy_mc_user_highpage(folio_page(new_folio, 0), page, addr, vma)) { folio_put(new_folio); return ERR_PTR(-EHWPOISON); } folio_set_dirty(new_folio); __folio_mark_uptodate(new_folio); __folio_set_locked(new_folio); #ifdef CONFIG_SWAP count_vm_event(KSM_SWPIN_COPY); #endif } return new_folio; } void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; int search_new_forks = 0; VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio); /* * Rely on the page lock to protect against concurrent modifications * to that page's node of the stable tree. */ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); stable_node = folio_stable_node(folio); if (!stable_node) return; again: hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; cond_resched(); if (!anon_vma_trylock_read(anon_vma)) { if (rwc->try_lock) { rwc->contended = true; return; } anon_vma_lock_read(anon_vma); } anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { unsigned long addr; cond_resched(); vma = vmac->vma; /* Ignore the stable/unstable/sqnr flags */ addr = rmap_item->address & PAGE_MASK; if (addr < vma->vm_start || addr >= vma->vm_end) continue; /* * Initially we examine only the vma which covers this * rmap_item; but later, if there is still work to do, * we examine covering vmas in other mms: in case they * were forked from the original since ksmd passed. 
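 *
 * The comparison below encodes both passes: with search_new_forks
 * == 0 every vma that does not belong to rmap_item->mm is skipped,
 * and on the second pass (search_new_forks == 1) it is exactly
 * rmap_item's own mm that is skipped.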
*/ if ((rmap_item->mm == vma->vm_mm) == search_new_forks) continue; if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) { anon_vma_unlock_read(anon_vma); return; } if (rwc->done && rwc->done(folio)) { anon_vma_unlock_read(anon_vma); return; } } anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; } #ifdef CONFIG_MEMORY_FAILURE /* * Collect processes when the error hit an ksm page. */ void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; struct vm_area_struct *vma; struct task_struct *tsk; stable_node = folio_stable_node(folio); if (!stable_node) return; hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *av = rmap_item->anon_vma; anon_vma_lock_read(av); rcu_read_lock(); for_each_process(tsk) { struct anon_vma_chain *vmac; unsigned long addr; struct task_struct *t = task_early_kill(tsk, force_early); if (!t) continue; anon_vma_interval_tree_foreach(vmac, &av->rb_root, 0, ULONG_MAX) { vma = vmac->vma; if (vma->vm_mm == t->mm) { addr = rmap_item->address & PAGE_MASK; add_to_kill_ksm(t, page, vma, to_kill, addr); } } } rcu_read_unlock(); anon_vma_unlock_read(av); } } #endif #ifdef CONFIG_MIGRATION void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) { struct ksm_stable_node *stable_node; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio); VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio); stable_node = folio_stable_node(folio); if (stable_node) { VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio); stable_node->kpfn = folio_pfn(newfolio); /* * newfolio->mapping was set in advance; now we need smp_wmb() * to make sure that the new stable_node->kpfn is visible * to ksm_get_folio() before it can see that folio->mapping * has gone stale (or that the swapcache flag has been cleared). 
*/ smp_wmb(); folio_set_stable_node(folio, NULL); } } #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_HOTREMOVE static void wait_while_offlining(void) { while (ksm_run & KSM_RUN_OFFLINE) { mutex_unlock(&ksm_thread_mutex); wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), TASK_UNINTERRUPTIBLE); mutex_lock(&ksm_thread_mutex); } } static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn) { if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) { /* * Don't ksm_get_folio, page has already gone: * which is why we keep kpfn instead of page* */ remove_node_from_stable_tree(stable_node); return true; } return false; } static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn, struct rb_root *root) { struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { VM_BUG_ON(is_stable_node_dup(stable_node)); return stable_node_dup_remove_range(stable_node, start_pfn, end_pfn); } hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { VM_BUG_ON(!is_stable_node_dup(dup)); stable_node_dup_remove_range(dup, start_pfn, end_pfn); } if (hlist_empty(&stable_node->hlist)) { free_stable_node_chain(stable_node, root); return true; /* notify caller that tree was rebalanced */ } else return false; } static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { struct ksm_stable_node *stable_node, *next; struct rb_node *node; int nid; for (nid = 0; nid < ksm_nr_node_ids; nid++) { node = rb_first(root_stable_tree + nid); while (node) { stable_node = rb_entry(node, struct ksm_stable_node, node); if (stable_node_chain_remove_range(stable_node, start_pfn, end_pfn, root_stable_tree + nid)) node = rb_first(root_stable_tree + nid); else node = rb_next(node); cond_resched(); } } list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) remove_node_from_stable_tree(stable_node); cond_resched(); } } static int ksm_memory_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; switch (action) { case MEM_GOING_OFFLINE: /* * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items() * and remove_all_stable_nodes() while memory is going offline: * it is unsafe for them to touch the stable tree at this time. * But unmerge_ksm_pages(), rmap lookups and other entry points * which do not need the ksm_thread_mutex are all safe. */ mutex_lock(&ksm_thread_mutex); ksm_run |= KSM_RUN_OFFLINE; mutex_unlock(&ksm_thread_mutex); break; case MEM_OFFLINE: /* * Most of the work is done by page migration; but there might * be a few stable_nodes left over, still pointing to struct * pages which have been offlined: prune those from the tree, * otherwise ksm_get_folio() might later try to access a * non-existent struct page. */ ksm_check_stable_tree(mn->start_pfn, mn->start_pfn + mn->nr_pages); fallthrough; case MEM_CANCEL_OFFLINE: mutex_lock(&ksm_thread_mutex); ksm_run &= ~KSM_RUN_OFFLINE; mutex_unlock(&ksm_thread_mutex); smp_mb(); /* wake_up_bit advises this */ wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE)); break; } return NOTIFY_OK; } #else static void wait_while_offlining(void) { } #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_PROC_FS /* * The process is mergeable only if any VMA is currently * applicable to KSM. * * The mmap lock must be held in read mode. 
*/ bool ksm_process_mergeable(struct mm_struct *mm) { struct vm_area_struct *vma; mmap_assert_locked(mm); VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) if (vma->vm_flags & VM_MERGEABLE) return true; return false; } long ksm_process_profit(struct mm_struct *mm) { return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE - mm->ksm_rmap_items * sizeof(struct ksm_rmap_item); } #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSFS /* * This all compiles without CONFIG_SYSFS, but is a waste of space. */ #define KSM_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define KSM_ATTR(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RW(_name) static ssize_t sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs); } static ssize_t sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; ksm_thread_sleep_millisecs = msecs; wake_up_interruptible(&ksm_iter_wait); return count; } KSM_ATTR(sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int nr_pages; int err; if (ksm_advisor != KSM_ADVISOR_NONE) return -EINVAL; err = kstrtouint(buf, 10, &nr_pages); if (err) return -EINVAL; ksm_thread_pages_to_scan = nr_pages; return count; } KSM_ATTR(pages_to_scan); static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_run); } static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int flags; int err; err = kstrtouint(buf, 10, &flags); if (err) return -EINVAL; if (flags > KSM_RUN_UNMERGE) return -EINVAL; /* * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, * breaking COW to free the pages_shared (but leaves mm_slots * on the list for when ksmd may be set running again). */ mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_run != flags) { ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { set_current_oom_origin(); err = unmerge_and_remove_all_rmap_items(); clear_current_oom_origin(); if (err) { ksm_run = KSM_RUN_STOP; count = err; } } } mutex_unlock(&ksm_thread_mutex); if (flags & KSM_RUN_MERGE) wake_up_interruptible(&ksm_thread_wait); return count; } KSM_ATTR(run); #ifdef CONFIG_NUMA static ssize_t merge_across_nodes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes); } static ssize_t merge_across_nodes_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long knob; err = kstrtoul(buf, 10, &knob); if (err) return err; if (knob > 1) return -EINVAL; mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_merge_across_nodes != knob) { if (ksm_pages_shared || remove_all_stable_nodes()) err = -EBUSY; else if (root_stable_tree == one_stable_tree) { struct rb_root *buf; /* * This is the first time that we switch away from the * default of merging across nodes: must now allocate * a buffer to hold as many roots as may be needed. 
* Allocate stable and unstable together: * MAXSMP NODES_SHIFT 10 will use 16kB. */ buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf), GFP_KERNEL); /* Let us assume that RB_ROOT is NULL is zero */ if (!buf) err = -ENOMEM; else { root_stable_tree = buf; root_unstable_tree = buf + nr_node_ids; /* Stable tree is empty but not the unstable */ root_unstable_tree[0] = one_unstable_tree[0]; } } if (!err) { ksm_merge_across_nodes = knob; ksm_nr_node_ids = knob ? 1 : nr_node_ids; } } mutex_unlock(&ksm_thread_mutex); return err ? err : count; } KSM_ATTR(merge_across_nodes); #endif static ssize_t use_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_use_zero_pages); } static ssize_t use_zero_pages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; bool value; err = kstrtobool(buf, &value); if (err) return -EINVAL; ksm_use_zero_pages = value; return count; } KSM_ATTR(use_zero_pages); static ssize_t max_page_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_max_page_sharing); } static ssize_t max_page_sharing_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; int knob; err = kstrtoint(buf, 10, &knob); if (err) return err; /* * When a KSM page is created it is shared by 2 mappings. This * being a signed comparison, it implicitly verifies it's not * negative. */ if (knob < 2) return -EINVAL; if (READ_ONCE(ksm_max_page_sharing) == knob) return count; mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_max_page_sharing != knob) { if (ksm_pages_shared || remove_all_stable_nodes()) err = -EBUSY; else ksm_max_page_sharing = knob; } mutex_unlock(&ksm_thread_mutex); return err ? err : count; } KSM_ATTR(max_page_sharing); static ssize_t pages_scanned_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_scanned); } KSM_ATTR_RO(pages_scanned); static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_shared); } KSM_ATTR_RO(pages_shared); static ssize_t pages_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_sharing); } KSM_ATTR_RO(pages_sharing); static ssize_t pages_unshared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_unshared); } KSM_ATTR_RO(pages_unshared); static ssize_t pages_volatile_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { long ksm_pages_volatile; ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared - ksm_pages_sharing - ksm_pages_unshared; /* * It was not worth any locking to calculate that statistic, * but it might therefore sometimes be negative: conceal that. 
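 *
 * What is reported is the number of rmap_items not accounted as
 * shared, sharing or unshared, typically pages whose contents are
 * still changing too fast to be worth placing in either tree.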
*/ if (ksm_pages_volatile < 0) ksm_pages_volatile = 0; return sysfs_emit(buf, "%ld\n", ksm_pages_volatile); } KSM_ATTR_RO(pages_volatile); static ssize_t pages_skipped_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_skipped); } KSM_ATTR_RO(pages_skipped); static ssize_t ksm_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%ld\n", atomic_long_read(&ksm_zero_pages)); } KSM_ATTR_RO(ksm_zero_pages); static ssize_t general_profit_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { long general_profit; general_profit = (ksm_pages_sharing + atomic_long_read(&ksm_zero_pages)) * PAGE_SIZE - ksm_rmap_items * sizeof(struct ksm_rmap_item); return sysfs_emit(buf, "%ld\n", general_profit); } KSM_ATTR_RO(general_profit); static ssize_t stable_node_dups_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups); } KSM_ATTR_RO(stable_node_dups); static ssize_t stable_node_chains_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains); } KSM_ATTR_RO(stable_node_chains); static ssize_t stable_node_chains_prune_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); } static ssize_t stable_node_chains_prune_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; ksm_stable_node_chains_prune_millisecs = msecs; return count; } KSM_ATTR(stable_node_chains_prune_millisecs); static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr); } KSM_ATTR_RO(full_scans); static ssize_t smart_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_smart_scan); } static ssize_t smart_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; bool value; err = kstrtobool(buf, &value); if (err) return -EINVAL; ksm_smart_scan = value; return count; } KSM_ATTR(smart_scan); static ssize_t advisor_mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { const char *output; if (ksm_advisor == KSM_ADVISOR_NONE) output = "[none] scan-time"; else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) output = "none [scan-time]"; return sysfs_emit(buf, "%s\n", output); } static ssize_t advisor_mode_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { enum ksm_advisor_type curr_advisor = ksm_advisor; if (sysfs_streq("scan-time", buf)) ksm_advisor = KSM_ADVISOR_SCAN_TIME; else if (sysfs_streq("none", buf)) ksm_advisor = KSM_ADVISOR_NONE; else return -EINVAL; /* Set advisor default values */ if (curr_advisor != ksm_advisor) set_advisor_defaults(); return count; } KSM_ATTR(advisor_mode); static ssize_t advisor_max_cpu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_advisor_max_cpu); } static ssize_t advisor_max_cpu_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_max_cpu = value; return count; } KSM_ATTR(advisor_max_cpu); static ssize_t 
advisor_min_pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_min_pages_to_scan); } static ssize_t advisor_min_pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_min_pages_to_scan = value; return count; } KSM_ATTR(advisor_min_pages_to_scan); static ssize_t advisor_max_pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_max_pages_to_scan); } static ssize_t advisor_max_pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_max_pages_to_scan = value; return count; } KSM_ATTR(advisor_max_pages_to_scan); static ssize_t advisor_target_scan_time_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_target_scan_time); } static ssize_t advisor_target_scan_time_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; if (value < 1) return -EINVAL; ksm_advisor_target_scan_time = value; return count; } KSM_ATTR(advisor_target_scan_time); static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, &run_attr.attr, &pages_scanned_attr.attr, &pages_shared_attr.attr, &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, &pages_skipped_attr.attr, &ksm_zero_pages_attr.attr, &full_scans_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, #endif &max_page_sharing_attr.attr, &stable_node_chains_attr.attr, &stable_node_dups_attr.attr, &stable_node_chains_prune_millisecs_attr.attr, &use_zero_pages_attr.attr, &general_profit_attr.attr, &smart_scan_attr.attr, &advisor_mode_attr.attr, &advisor_max_cpu_attr.attr, &advisor_min_pages_to_scan_attr.attr, &advisor_max_pages_to_scan_attr.attr, &advisor_target_scan_time_attr.attr, NULL, }; static const struct attribute_group ksm_attr_group = { .attrs = ksm_attrs, .name = "ksm", }; #endif /* CONFIG_SYSFS */ static int __init ksm_init(void) { struct task_struct *ksm_thread; int err; /* The correct value depends on page size and endianness */ zero_checksum = calc_checksum(ZERO_PAGE(0)); /* Default to false for backwards compatibility */ ksm_use_zero_pages = false; err = ksm_slab_init(); if (err) goto out; ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); if (IS_ERR(ksm_thread)) { pr_err("ksm: creating kthread failed\n"); err = PTR_ERR(ksm_thread); goto out_free; } #ifdef CONFIG_SYSFS err = sysfs_create_group(mm_kobj, &ksm_attr_group); if (err) { pr_err("ksm: register sysfs failed\n"); kthread_stop(ksm_thread); goto out_free; } #else ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ #endif /* CONFIG_SYSFS */ #ifdef CONFIG_MEMORY_HOTREMOVE /* There is no significance to this priority 100 */ hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI); #endif return 0; out_free: ksm_slab_free(); out: return err; } subsys_initcall(ksm_init); |
/* * net/tipc/discover.c * * Copyright (c) 2003-2006, 2014-2018, Ericsson AB * Copyright (c) 2005-2006, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #include "core.h" #include "node.h" #include "discover.h" /* min delay during bearer start up */ #define TIPC_DISC_INIT msecs_to_jiffies(125) /* max delay if bearer has no links */ #define TIPC_DISC_FAST msecs_to_jiffies(1000) /* max delay if bearer has links */ #define TIPC_DISC_SLOW msecs_to_jiffies(60000) /* indicates no timer in use */ #define TIPC_DISC_INACTIVE 0xffffffff /** * struct tipc_discoverer - information about an ongoing link setup request * @bearer_id: identity of bearer issuing requests * @net: network namespace instance * @dest: destination address for request messages * @domain: network domain to which links can be established * @num_nodes: number of nodes currently discovered (i.e. with an active link) * @lock: spinlock for controlling access to requests * @skb: request message to be (repeatedly) sent * @timer: timer governing period between requests * @timer_intv: current interval between requests (in ms) */ struct tipc_discoverer { u32 bearer_id; struct tipc_media_addr dest; struct net *net; u32 domain; int num_nodes; spinlock_t lock; struct sk_buff *skb; struct timer_list timer; unsigned long timer_intv; }; /** * tipc_disc_init_msg - initialize a link setup message * @net: the applicable net namespace * @skb: buffer containing message * @mtyp: message type (request or response) * @b: ptr to bearer issuing message */ static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb, u32 mtyp, struct tipc_bearer *b) { struct tipc_net *tn = tipc_net(net); u32 dest_domain = b->domain; struct tipc_msg *hdr; hdr = buf_msg(skb); tipc_msg_init(tn->trial_addr, hdr, LINK_CONFIG, mtyp, MAX_H_SIZE, dest_domain); msg_set_size(hdr, MAX_H_SIZE + NODE_ID_LEN); msg_set_non_seq(hdr, 1); msg_set_node_sig(hdr, tn->random); msg_set_node_capabilities(hdr, TIPC_NODE_CAPABILITIES); msg_set_dest_domain(hdr, dest_domain); msg_set_bc_netid(hdr, tn->net_id); b->media->addr2msg(msg_media_addr(hdr), &b->addr); msg_set_peer_net_hash(hdr, tipc_net_hash_mixes(net, tn->random)); msg_set_node_id(hdr, tipc_own_id(net)); } static void tipc_disc_msg_xmit(struct net *net, u32 mtyp, u32 dst, u32 src, u32 sugg_addr, struct tipc_media_addr *maddr, struct tipc_bearer *b) { struct tipc_msg *hdr; struct sk_buff *skb; skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC); if (!skb) return; hdr = buf_msg(skb); tipc_disc_init_msg(net, skb, mtyp, b); msg_set_sugg_node_addr(hdr, sugg_addr); msg_set_dest_domain(hdr, dst); tipc_bearer_xmit_skb(net, b->identity, skb, maddr); } /** * disc_dupl_alert - issue node address duplication alert * @b: pointer to bearer detecting duplication * @node_addr: duplicated node address * @media_addr: media address advertised by duplicated node */ static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr, struct tipc_media_addr *media_addr) { char media_addr_str[64]; tipc_media_addr_printf(media_addr_str, sizeof(media_addr_str), media_addr); pr_warn("Duplicate %x using %s seen on <%s>\n", node_addr, media_addr_str, b->name); } /* tipc_disc_addr_trial(): - handle an address uniqueness trial from peer * Returns true if message should be dropped by caller, i.e., if it is a * trial message or we are inside trial period. Otherwise false. 
*/ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, struct tipc_media_addr *maddr, struct tipc_bearer *b, u32 dst, u32 src, u32 sugg_addr, u8 *peer_id, int mtyp) { struct net *net = d->net; struct tipc_net *tn = tipc_net(net); u32 self = tipc_own_addr(net); bool trial = time_before(jiffies, tn->addr_trial_end) && !self; if (mtyp == DSC_TRIAL_FAIL_MSG) { if (!trial) return true; /* Ignore if somebody else already gave new suggestion */ if (dst != tn->trial_addr) return true; /* Otherwise update trial address and restart trial period */ tn->trial_addr = sugg_addr; msg_set_prevnode(buf_msg(d->skb), sugg_addr); tn->addr_trial_end = jiffies + msecs_to_jiffies(1000); return true; } /* Apply trial address if we just left trial period */ if (!trial && !self) { schedule_work(&tn->work); msg_set_prevnode(buf_msg(d->skb), tn->trial_addr); msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); } /* Accept regular link requests/responses only after trial period */ if (mtyp != DSC_TRIAL_MSG) return trial; sugg_addr = tipc_node_try_addr(net, peer_id, src); if (sugg_addr) tipc_disc_msg_xmit(net, DSC_TRIAL_FAIL_MSG, src, self, sugg_addr, maddr, b); return true; } /** * tipc_disc_rcv - handle incoming discovery message (request or response) * @net: applicable net namespace * @skb: buffer containing message * @b: bearer that message arrived on */ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct tipc_net *tn = tipc_net(net); struct tipc_msg *hdr = buf_msg(skb); u32 pnet_hash = msg_peer_net_hash(hdr); u16 caps = msg_node_capabilities(hdr); bool legacy = tn->legacy_addr_format; u32 sugg = msg_sugg_node_addr(hdr); u32 signature = msg_node_sig(hdr); u8 peer_id[NODE_ID_LEN] = {0,}; u32 dst = msg_dest_domain(hdr); u32 net_id = msg_bc_netid(hdr); struct tipc_media_addr maddr; u32 src = msg_prevnode(hdr); u32 mtyp = msg_type(hdr); bool dupl_addr = false; bool respond = false; u32 self; int err; if (skb_linearize(skb)) { kfree_skb(skb); return; } hdr = buf_msg(skb); if (caps & TIPC_NODE_ID128) memcpy(peer_id, msg_node_id(hdr), NODE_ID_LEN); else sprintf(peer_id, "%x", src); err = b->media->msg2addr(b, &maddr, msg_media_addr(hdr)); kfree_skb(skb); if (err || maddr.broadcast) { pr_warn_ratelimited("Rcv corrupt discovery message\n"); return; } /* Ignore discovery messages from own node */ if (!memcmp(&maddr, &b->addr, sizeof(maddr))) return; if (net_id != tn->net_id) return; if (tipc_disc_addr_trial_msg(b->disc, &maddr, b, dst, src, sugg, peer_id, mtyp)) return; self = tipc_own_addr(net); /* Message from somebody using this node's address */ if (in_own_node(net, src)) { disc_dupl_alert(b, self, &maddr); return; } if (!tipc_in_scope(legacy, dst, self)) return; if (!tipc_in_scope(legacy, b->domain, src)) return; tipc_node_check_dest(net, src, peer_id, b, caps, signature, pnet_hash, &maddr, &respond, &dupl_addr); if (dupl_addr) disc_dupl_alert(b, src, &maddr); if (!respond) return; if (mtyp != DSC_REQ_MSG) return; tipc_disc_msg_xmit(net, DSC_RESP_MSG, src, self, 0, &maddr, b); } /* tipc_disc_add_dest - increment set of discovered nodes */ void tipc_disc_add_dest(struct tipc_discoverer *d) { spin_lock_bh(&d->lock); d->num_nodes++; spin_unlock_bh(&d->lock); } /* tipc_disc_remove_dest - decrement set of discovered nodes */ void tipc_disc_remove_dest(struct tipc_discoverer *d) { int intv, num; spin_lock_bh(&d->lock); d->num_nodes--; num = d->num_nodes; intv = d->timer_intv; if (!num && (intv == TIPC_DISC_INACTIVE || intv > TIPC_DISC_FAST)) { d->timer_intv = TIPC_DISC_INIT; 
mod_timer(&d->timer, jiffies + d->timer_intv); } spin_unlock_bh(&d->lock); } /* tipc_disc_timeout - send a periodic link setup request * Called whenever a link setup request timer associated with a bearer expires. * - Keep doubling time between sent request until limit is reached; * - Hold at fast polling rate if we don't have any associated nodes * - Otherwise hold at slow polling rate */ static void tipc_disc_timeout(struct timer_list *t) { struct tipc_discoverer *d = from_timer(d, t, timer); struct tipc_net *tn = tipc_net(d->net); struct tipc_media_addr maddr; struct sk_buff *skb = NULL; struct net *net = d->net; u32 bearer_id; spin_lock_bh(&d->lock); /* Stop searching if only desired node has been found */ if (tipc_node(d->domain) && d->num_nodes) { d->timer_intv = TIPC_DISC_INACTIVE; goto exit; } /* Did we just leave trial period ? */ if (!time_before(jiffies, tn->addr_trial_end) && !tipc_own_addr(net)) { mod_timer(&d->timer, jiffies + TIPC_DISC_INIT); spin_unlock_bh(&d->lock); schedule_work(&tn->work); return; } /* Adjust timeout interval according to discovery phase */ if (time_before(jiffies, tn->addr_trial_end)) { d->timer_intv = TIPC_DISC_INIT; } else { d->timer_intv *= 2; if (d->num_nodes && d->timer_intv > TIPC_DISC_SLOW) d->timer_intv = TIPC_DISC_SLOW; else if (!d->num_nodes && d->timer_intv > TIPC_DISC_FAST) d->timer_intv = TIPC_DISC_FAST; msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); msg_set_prevnode(buf_msg(d->skb), tn->trial_addr); } mod_timer(&d->timer, jiffies + d->timer_intv); memcpy(&maddr, &d->dest, sizeof(maddr)); skb = skb_clone(d->skb, GFP_ATOMIC); bearer_id = d->bearer_id; exit: spin_unlock_bh(&d->lock); if (skb) tipc_bearer_xmit_skb(net, bearer_id, skb, &maddr); } /** * tipc_disc_create - create object to send periodic link setup requests * @net: the applicable net namespace * @b: ptr to bearer issuing requests * @dest: destination address for request messages * @skb: pointer to created frame * * Return: 0 if successful, otherwise -errno. */ int tipc_disc_create(struct net *net, struct tipc_bearer *b, struct tipc_media_addr *dest, struct sk_buff **skb) { struct tipc_net *tn = tipc_net(net); struct tipc_discoverer *d; d = kmalloc(sizeof(*d), GFP_ATOMIC); if (!d) return -ENOMEM; d->skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC); if (!d->skb) { kfree(d); return -ENOMEM; } tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b); /* Do we need an address trial period first ? 
*/ if (!tipc_own_addr(net)) { tn->addr_trial_end = jiffies + msecs_to_jiffies(1000); msg_set_type(buf_msg(d->skb), DSC_TRIAL_MSG); } memcpy(&d->dest, dest, sizeof(*dest)); d->net = net; d->bearer_id = b->identity; d->domain = b->domain; d->num_nodes = 0; d->timer_intv = TIPC_DISC_INIT; spin_lock_init(&d->lock); timer_setup(&d->timer, tipc_disc_timeout, 0); mod_timer(&d->timer, jiffies + d->timer_intv); b->disc = d; *skb = skb_clone(d->skb, GFP_ATOMIC); return 0; } /** * tipc_disc_delete - destroy object sending periodic link setup requests * @d: ptr to link dest structure */ void tipc_disc_delete(struct tipc_discoverer *d) { timer_shutdown_sync(&d->timer); kfree_skb(d->skb); kfree(d); } /** * tipc_disc_reset - reset object to send periodic link setup requests * @net: the applicable net namespace * @b: ptr to bearer issuing requests */ void tipc_disc_reset(struct net *net, struct tipc_bearer *b) { struct tipc_discoverer *d = b->disc; struct tipc_media_addr maddr; struct sk_buff *skb; spin_lock_bh(&d->lock); tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b); d->net = net; d->bearer_id = b->identity; d->domain = b->domain; d->num_nodes = 0; d->timer_intv = TIPC_DISC_INIT; memcpy(&maddr, &d->dest, sizeof(maddr)); mod_timer(&d->timer, jiffies + d->timer_intv); skb = skb_clone(d->skb, GFP_ATOMIC); spin_unlock_bh(&d->lock); if (skb) tipc_bearer_xmit_skb(net, b->identity, skb, &maddr); } |
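/*
 * Editor's illustrative sketch (not part of discover.c above): the back-off
 * policy that tipc_disc_timeout() applies to the request interval, written
 * as a standalone helper working in milliseconds.  The constants mirror
 * TIPC_DISC_INIT (125 ms), TIPC_DISC_FAST (1000 ms) and TIPC_DISC_SLOW
 * (60000 ms) defined above; the function name is invented for the example.
 */
#include <stdbool.h>

#define DISC_INIT_MS    125U
#define DISC_FAST_MS   1000U
#define DISC_SLOW_MS  60000U

static unsigned int disc_next_interval(unsigned int cur_ms, bool have_nodes)
{
	unsigned int next = cur_ms * 2;	/* keep doubling between requests */

	if (have_nodes && next > DISC_SLOW_MS)
		next = DISC_SLOW_MS;	/* links exist: poll slowly */
	else if (!have_nodes && next > DISC_FAST_MS)
		next = DISC_FAST_MS;	/* still searching: hold at fast rate */
	return next;
}
/* e.g. starting from DISC_INIT_MS: 125 -> 250 -> 500 -> 1000 -> 1000 ... while
 * no node is discovered, or up to 60000 once at least one link exists.
 */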
// SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <crypto/sha2.h> #include <net/tcp.h> #include <net/mptcp.h> #include "protocol.h" #include "mib.h" #include <trace/events/mptcp.h> static bool mptcp_cap_flag_sha256(u8 flags) { return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; } static void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, int opsize, struct mptcp_options_received *mp_opt) { u8 subtype = *ptr >> 4; int expected_opsize; u16 subopt; u8 version; u8 flags; u8 i; switch (subtype) { case MPTCPOPT_MP_CAPABLE: /* strict size checking */ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { if (skb->len > tcp_hdr(skb)->doff << 2) expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA; else expected_opsize = TCPOLEN_MPTCP_MPC_ACK; subopt = OPTION_MPTCP_MPC_ACK; } else { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) { expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK; subopt = OPTION_MPTCP_MPC_SYNACK; } else { expected_opsize = TCPOLEN_MPTCP_MPC_SYN; subopt = OPTION_MPTCP_MPC_SYN; } } /* Cfr RFC 8684 Section 3.3.0: * If a checksum is present but its use had * not been negotiated in the MP_CAPABLE handshake, the receiver MUST * close the subflow with a RST, as it is not behaving as negotiated. * If a checksum is not present when its use has been negotiated, the * receiver MUST close the subflow with a RST, as it is considered * broken * We parse even option with mismatching csum presence, so that * later in subflow_data_ready we can trigger the reset. */ if (opsize != expected_opsize && (expected_opsize != TCPOLEN_MPTCP_MPC_ACK_DATA || opsize != TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM)) break; /* try to be gentle vs future versions on the initial syn */ version = *ptr++ & MPTCP_VERSION_MASK; if (opsize != TCPOLEN_MPTCP_MPC_SYN) { if (version != MPTCP_SUPPORTED_VERSION) break; } else if (version < MPTCP_SUPPORTED_VERSION) { break; } flags = *ptr++; if (!mptcp_cap_flag_sha256(flags) || (flags & MPTCP_CAP_EXTENSIBILITY)) break; /* RFC 6824, Section 3.1: * "For the Checksum Required bit (labeled "A"), if either * host requires the use of checksums, checksums MUST be used. * In other words, the only way for checksums not to be used * is if both hosts in their SYNs set A=0." */ if (flags & MPTCP_CAP_CHECKSUM_REQD) mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; mp_opt->deny_join_id0 = !!(flags & MPTCP_CAP_DENY_JOIN_ID0); mp_opt->suboptions |= subopt; if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { mp_opt->sndr_key = get_unaligned_be64(ptr); ptr += 8; } if (opsize >= TCPOLEN_MPTCP_MPC_ACK) { mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; } if (opsize >= TCPOLEN_MPTCP_MPC_ACK_DATA) { /* Section 3.1.: * "the data parameters in a MP_CAPABLE are semantically * equivalent to those in a DSS option and can be used * interchangeably."
*/ mp_opt->suboptions |= OPTION_MPTCP_DSS; mp_opt->use_map = 1; mp_opt->mpc_map = 1; mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; } if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) { mp_opt->csum = get_unaligned((__force __sum16 *)ptr); mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; ptr += 2; } pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u\n", version, flags, opsize, mp_opt->sndr_key, mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum); break; case MPTCPOPT_MP_JOIN: if (opsize == TCPOLEN_MPTCP_MPJ_SYN) { mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYN; mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; mp_opt->token = get_unaligned_be32(ptr); ptr += 4; mp_opt->nonce = get_unaligned_be32(ptr); ptr += 4; pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u\n", mp_opt->backup, mp_opt->join_id, mp_opt->token, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) { mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYNACK; mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; mp_opt->thmac = get_unaligned_be64(ptr); ptr += 8; mp_opt->nonce = get_unaligned_be32(ptr); ptr += 4; pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u\n", mp_opt->backup, mp_opt->join_id, mp_opt->thmac, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) { mp_opt->suboptions |= OPTION_MPTCP_MPJ_ACK; ptr += 2; memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN); pr_debug("MP_JOIN hmac\n"); } break; case MPTCPOPT_DSS: pr_debug("DSS\n"); ptr++; flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0; mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0; mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK); pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d\n", mp_opt->data_fin, mp_opt->dsn64, mp_opt->use_map, mp_opt->ack64, mp_opt->use_ack); expected_opsize = TCPOLEN_MPTCP_DSS_BASE; if (mp_opt->use_ack) { if (mp_opt->ack64) expected_opsize += TCPOLEN_MPTCP_DSS_ACK64; else expected_opsize += TCPOLEN_MPTCP_DSS_ACK32; } if (mp_opt->use_map) { if (mp_opt->dsn64) expected_opsize += TCPOLEN_MPTCP_DSS_MAP64; else expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; } /* Always parse any csum presence combination, we will enforce * RFC 8684 Section 3.3.0 checks later in subflow_data_ready */ if (opsize != expected_opsize && opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) break; mp_opt->suboptions |= OPTION_MPTCP_DSS; if (mp_opt->use_ack) { if (mp_opt->ack64) { mp_opt->data_ack = get_unaligned_be64(ptr); ptr += 8; } else { mp_opt->data_ack = get_unaligned_be32(ptr); ptr += 4; } pr_debug("data_ack=%llu\n", mp_opt->data_ack); } if (mp_opt->use_map) { if (mp_opt->dsn64) { mp_opt->data_seq = get_unaligned_be64(ptr); ptr += 8; } else { mp_opt->data_seq = get_unaligned_be32(ptr); ptr += 4; } mp_opt->subflow_seq = get_unaligned_be32(ptr); ptr += 4; mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) { mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; mp_opt->csum = get_unaligned((__force __sum16 *)ptr); ptr += 2; } pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n", mp_opt->data_seq, mp_opt->subflow_seq, mp_opt->data_len, !!(mp_opt->suboptions & OPTION_MPTCP_CSUMREQD), mp_opt->csum); } break; case MPTCPOPT_ADD_ADDR: mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO; if (!mp_opt->echo) { if (opsize == TCPOLEN_MPTCP_ADD_ADDR || opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT) 
mp_opt->addr.family = AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 || opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT) mp_opt->addr.family = AF_INET6; #endif else break; } else { if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE || opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) mp_opt->addr.family = AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE || opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) mp_opt->addr.family = AF_INET6; #endif else break; } mp_opt->suboptions |= OPTION_MPTCP_ADD_ADDR; mp_opt->addr.id = *ptr++; mp_opt->addr.port = 0; mp_opt->ahmac = 0; if (mp_opt->addr.family == AF_INET) { memcpy((u8 *)&mp_opt->addr.addr.s_addr, (u8 *)ptr, 4); ptr += 4; if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT || opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) { mp_opt->addr.port = htons(get_unaligned_be16(ptr)); ptr += 2; } } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else { memcpy(mp_opt->addr.addr6.s6_addr, (u8 *)ptr, 16); ptr += 16; if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT || opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) { mp_opt->addr.port = htons(get_unaligned_be16(ptr)); ptr += 2; } } #endif if (!mp_opt->echo) { mp_opt->ahmac = get_unaligned_be64(ptr); ptr += 8; } pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d\n", (mp_opt->addr.family == AF_INET6) ? "6" : "", mp_opt->addr.id, mp_opt->ahmac, mp_opt->echo, ntohs(mp_opt->addr.port)); break; case MPTCPOPT_RM_ADDR: if (opsize < TCPOLEN_MPTCP_RM_ADDR_BASE + 1 || opsize > TCPOLEN_MPTCP_RM_ADDR_BASE + MPTCP_RM_IDS_MAX) break; ptr++; mp_opt->suboptions |= OPTION_MPTCP_RM_ADDR; mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE; for (i = 0; i < mp_opt->rm_list.nr; i++) mp_opt->rm_list.ids[i] = *ptr++; pr_debug("RM_ADDR: rm_list_nr=%d\n", mp_opt->rm_list.nr); break; case MPTCPOPT_MP_PRIO: if (opsize != TCPOLEN_MPTCP_PRIO) break; mp_opt->suboptions |= OPTION_MPTCP_PRIO; mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP; pr_debug("MP_PRIO: prio=%d\n", mp_opt->backup); break; case MPTCPOPT_MP_FASTCLOSE: if (opsize != TCPOLEN_MPTCP_FASTCLOSE) break; ptr += 2; mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; mp_opt->suboptions |= OPTION_MPTCP_FASTCLOSE; pr_debug("MP_FASTCLOSE: recv_key=%llu\n", mp_opt->rcvr_key); break; case MPTCPOPT_RST: if (opsize != TCPOLEN_MPTCP_RST) break; if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) break; mp_opt->suboptions |= OPTION_MPTCP_RST; flags = *ptr++; mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT; mp_opt->reset_reason = *ptr; pr_debug("MP_RST: transient=%u reason=%u\n", mp_opt->reset_transient, mp_opt->reset_reason); break; case MPTCPOPT_MP_FAIL: if (opsize != TCPOLEN_MPTCP_FAIL) break; ptr += 2; mp_opt->suboptions |= OPTION_MPTCP_FAIL; mp_opt->fail_seq = get_unaligned_be64(ptr); pr_debug("MP_FAIL: data_seq=%llu\n", mp_opt->fail_seq); break; default: break; } } void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt) { const struct tcphdr *th = tcp_hdr(skb); const unsigned char *ptr; int length; /* Ensure that casting the whole status to u32 is efficient and safe */ BUILD_BUG_ON(sizeof_field(struct mptcp_options_received, status) != sizeof(u32)); BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct mptcp_options_received, status), sizeof(u32))); *(u32 *)&mp_opt->status = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); while (length > 0) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: 
if (length < 2) return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) return; /* don't parse partial options */ if (opcode == TCPOPT_MPTCP) mptcp_parse_option(skb, ptr, opsize, mp_opt); ptr += opsize - 2; length -= opsize; } } } bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* we will use snd_isn to detect first pkt [re]transmission * in mptcp_established_options_mp() */ subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; if (subflow->request_mptcp) { opts->suboptions = OPTION_MPTCP_MPC_SYN; opts->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk)); opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); *size = TCPOLEN_MPTCP_MPC_SYN; return true; } else if (subflow->request_join) { pr_debug("remote_token=%u, nonce=%u\n", subflow->remote_token, subflow->local_nonce); opts->suboptions = OPTION_MPTCP_MPJ_SYN; opts->join_id = subflow->local_id; opts->token = subflow->remote_token; opts->nonce = subflow->local_nonce; opts->backup = subflow->request_bkup; *size = TCPOLEN_MPTCP_MPJ_SYN; return true; } return false; } static void clear_3rdack_retransmission(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); sk_stop_timer(sk, &icsk->icsk_delack_timer); icsk->icsk_ack.timeout = 0; icsk->icsk_ack.ato = 0; icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER); } static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, bool snd_data_fin_enable, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_ext *mpext; unsigned int data_len; u8 len; /* When skb is not available, we better over-estimate the emitted * options len. A full DSS option (28 bytes) is longer than * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so * tell the caller to defer the estimate to * mptcp_established_options_dss(), which will reserve enough space. */ if (!skb) return false; /* MPC/MPJ needed only on 3rd ack packet, DATA_FIN and TCP shutdown take precedence */ if (READ_ONCE(subflow->fully_established) || snd_data_fin_enable || subflow->snd_isn != TCP_SKB_CB(skb)->seq || sk->sk_state != TCP_ESTABLISHED) return false; if (subflow->mp_capable) { mpext = mptcp_get_ext(skb); data_len = mpext ? mpext->data_len : 0; /* we will check ops->data_len in mptcp_write_options() to * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and * TCPOLEN_MPTCP_MPC_ACK */ opts->data_len = data_len; opts->suboptions = OPTION_MPTCP_MPC_ACK; opts->sndr_key = subflow->local_key; opts->rcvr_key = subflow->remote_key; opts->csum_reqd = READ_ONCE(msk->csum_enabled); opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); /* Section 3.1. 
* The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK * packets that start the first subflow of an MPTCP connection, * as well as the first packet that carries data */ if (data_len > 0) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) { /* we need to propagate more info to csum the pseudo hdr */ opts->data_seq = mpext->data_seq; opts->subflow_seq = mpext->subflow_seq; opts->csum = mpext->csum; len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *size = ALIGN(len, 4); } else { *size = TCPOLEN_MPTCP_MPC_ACK; } pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d\n", subflow, subflow->local_key, subflow->remote_key, data_len); return true; } else if (subflow->mp_join) { opts->suboptions = OPTION_MPTCP_MPJ_ACK; memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN); *size = TCPOLEN_MPTCP_MPJ_ACK; pr_debug("subflow=%p\n", subflow); /* we can use the full delegate action helper only from BH context * If we are in process context - sk is flushing the backlog at * socket lock release time - just set the appropriate flag, will * be handled by the release callback */ if (sock_owned_by_user(sk)) set_bit(MPTCP_DELEGATE_ACK, &subflow->delegated_status); else mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_ACK); return true; } return false; } static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_ext *ext) { /* The write_seq value has already been incremented, so the actual * sequence number for the DATA_FIN is one less. */ u64 data_fin_tx_seq = READ_ONCE(mptcp_sk(subflow->conn)->write_seq) - 1; if (!ext->use_map || !skb->len) { /* RFC6824 requires a DSS mapping with specific values * if DATA_FIN is set but no data payload is mapped */ ext->data_fin = 1; ext->use_map = 1; ext->dsn64 = 1; ext->data_seq = data_fin_tx_seq; ext->subflow_seq = 0; ext->data_len = 1; } else if (ext->data_seq + ext->data_len == data_fin_tx_seq) { /* If there's an existing DSS mapping and it is the * final mapping, DATA_FIN consumes 1 additional byte of * mapping space. */ ext->data_fin = 1; ext->data_len++; } } static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, bool snd_data_fin_enable, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int dss_size = 0; struct mptcp_ext *mpext; unsigned int ack_size; bool ret = false; u64 ack_seq; opts->csum_reqd = READ_ONCE(msk->csum_enabled); mpext = skb ? 
mptcp_get_ext(skb) : NULL; if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { unsigned int map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; if (mpext) { if (opts->csum_reqd) map_size += TCPOLEN_MPTCP_DSS_CHECKSUM; opts->ext_copy = *mpext; } dss_size = map_size; if (skb && snd_data_fin_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); opts->suboptions = OPTION_MPTCP_DSS; ret = true; } /* passive sockets msk will set the 'can_ack' after accept(), even * if the first subflow may have the already the remote key handy */ opts->ext_copy.use_ack = 0; if (!READ_ONCE(msk->can_ack)) { *size = ALIGN(dss_size, 4); return ret; } ack_seq = READ_ONCE(msk->ack_seq); if (READ_ONCE(msk->use_64bit_ack)) { ack_size = TCPOLEN_MPTCP_DSS_ACK64; opts->ext_copy.data_ack = ack_seq; opts->ext_copy.ack64 = 1; } else { ack_size = TCPOLEN_MPTCP_DSS_ACK32; opts->ext_copy.data_ack32 = (uint32_t)ack_seq; opts->ext_copy.ack64 = 0; } opts->ext_copy.use_ack = 1; opts->suboptions = OPTION_MPTCP_DSS; /* Add kind/length/subtype/flag overhead if mapping is not populated */ if (dss_size == 0) ack_size += TCPOLEN_MPTCP_DSS_BASE; dss_size += ack_size; *size = ALIGN(dss_size, 4); return true; } static u64 add_addr_generate_hmac(u64 key1, u64 key2, struct mptcp_addr_info *addr) { u16 port = ntohs(addr->port); u8 hmac[SHA256_DIGEST_SIZE]; u8 msg[19]; int i = 0; msg[i++] = addr->id; if (addr->family == AF_INET) { memcpy(&msg[i], &addr->addr.s_addr, 4); i += 4; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6) { memcpy(&msg[i], &addr->addr6.s6_addr, 16); i += 16; } #endif msg[i++] = port >> 8; msg[i++] = port & 0xFF; mptcp_crypto_hmac_sha(key1, key2, msg, i, hmac); return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]); } static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); bool drop_other_suboptions = false; unsigned int opt_size = *size; bool echo; int len; /* add addr will strip the existing options, be sure to avoid breaking * MPC/MPJ handshakes */ if (!mptcp_pm_should_add_signal(msk) || (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) || !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr, &echo, &drop_other_suboptions)) return false; /* * Later on, mptcp_write_options() will enforce mutually exclusion with * DSS, bail out if such option is set and we can't drop it. */ if (drop_other_suboptions) remaining += opt_size; else if (opts->suboptions & OPTION_MPTCP_DSS) return false; len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port); if (remaining < len) return false; *size = len; if (drop_other_suboptions) { pr_debug("drop other suboptions\n"); opts->suboptions = 0; /* note that e.g. 
DSS could have written into the memory * aliased by ahmac, we must reset the field here * to avoid appending the hmac even for ADD_ADDR echo * options */ opts->ahmac = 0; *size -= opt_size; } opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX); opts->ahmac = add_addr_generate_hmac(READ_ONCE(msk->local_key), READ_ONCE(msk->remote_key), &opts->addr); } else { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX); } pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d\n", opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port)); return true; } static bool mptcp_established_options_rm_addr(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_rm_list rm_list; int i, len; if (!mptcp_pm_should_rm_signal(msk) || !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_list))) return false; len = mptcp_rm_addr_len(&rm_list); if (len < 0) return false; if (remaining < len) return false; *size = len; opts->suboptions |= OPTION_MPTCP_RM_ADDR; opts->rm_list = rm_list; for (i = 0; i < opts->rm_list.nr; i++) pr_debug("rm_list_ids[%d]=%d\n", i, opts->rm_list.ids[i]); MPTCP_ADD_STATS(sock_net(sk), MPTCP_MIB_RMADDRTX, opts->rm_list.nr); return true; } static bool mptcp_established_options_mp_prio(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* can't send MP_PRIO with MPC, as they share the same option space: * 'backup'. Also it makes no sense at all */ if (!subflow->send_mp_prio || (opts->suboptions & OPTIONS_MPTCP_MPC)) return false; /* account for the trailing 'nop' option */ if (remaining < TCPOLEN_MPTCP_PRIO_ALIGN) return false; *size = TCPOLEN_MPTCP_PRIO_ALIGN; opts->suboptions |= OPTION_MPTCP_PRIO; opts->backup = subflow->request_bkup; pr_debug("prio=%d\n", opts->backup); return true; } static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); if (remaining < TCPOLEN_MPTCP_RST) return false; *size = TCPOLEN_MPTCP_RST; opts->suboptions |= OPTION_MPTCP_RST; opts->reset_transient = subflow->reset_transient; opts->reset_reason = subflow->reset_reason; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX); return true; } static bool mptcp_established_options_fastclose(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); if (likely(!subflow->send_fastclose)) return false; if (remaining < TCPOLEN_MPTCP_FASTCLOSE) return false; *size = TCPOLEN_MPTCP_FASTCLOSE; opts->suboptions |= OPTION_MPTCP_FASTCLOSE; opts->rcvr_key = READ_ONCE(msk->remote_key); pr_debug("FASTCLOSE key=%llu\n", opts->rcvr_key); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX); return true; } static bool mptcp_established_options_mp_fail(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); if (likely(!subflow->send_mp_fail)) return false; if (remaining < TCPOLEN_MPTCP_FAIL) return false; *size = TCPOLEN_MPTCP_FAIL; opts->suboptions |= OPTION_MPTCP_FAIL; opts->fail_seq = subflow->map_seq; 
pr_debug("MP_FAIL fail_seq=%llu\n", opts->fail_seq); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX); return true; } bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int opt_size = 0; bool snd_data_fin; bool ret = false; opts->suboptions = 0; if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb))) return false; if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { if (mptcp_established_options_fastclose(sk, &opt_size, remaining, opts) || mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; } /* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */ if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; } return true; } snd_data_fin = mptcp_data_fin_enabled(msk); if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, opts)) { unsigned int mp_fail_size; ret = true; if (mptcp_established_options_mp_fail(sk, &mp_fail_size, remaining - opt_size, opts)) { *size += opt_size + mp_fail_size; remaining -= opt_size - mp_fail_size; return true; } } /* we reserved enough space for the above options, and exceeding the * TCP option space would be fatal */ if (WARN_ON_ONCE(opt_size > remaining)) return false; *size += opt_size; remaining -= opt_size; if (mptcp_established_options_add_addr(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } else if (mptcp_established_options_rm_addr(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } if (mptcp_established_options_mp_prio(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } return ret; } bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); if (subflow_req->mp_capable) { opts->suboptions = OPTION_MPTCP_MPC_SYNACK; opts->sndr_key = subflow_req->local_key; opts->csum_reqd = subflow_req->csum_reqd; opts->allow_join_id0 = subflow_req->allow_join_id0; *size = TCPOLEN_MPTCP_MPC_SYNACK; pr_debug("subflow_req=%p, local_key=%llu\n", subflow_req, subflow_req->local_key); return true; } else if (subflow_req->mp_join) { opts->suboptions = OPTION_MPTCP_MPJ_SYNACK; opts->backup = subflow_req->request_bkup; opts->join_id = subflow_req->local_id; opts->thmac = subflow_req->thmac; opts->nonce = subflow_req->local_nonce; pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u\n", subflow_req, opts->backup, opts->join_id, opts->thmac, opts->nonce); *size = TCPOLEN_MPTCP_MPJ_SYNACK; return true; } return false; } static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_options_received *mp_opt) { /* here we can process OoO, in-window pkts, only in-sequence 4th ack * will make the subflow fully established */ if (likely(READ_ONCE(subflow->fully_established))) { /* on passive sockets, check for 3rd ack retransmission * note that msk is always set by subflow_syn_recv_sock() * for mp_join subflows */ if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 && 
TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && subflow->mp_join && (mp_opt->suboptions & OPTIONS_MPTCP_MPJ) && !subflow->request_join) tcp_send_ack(ssk); goto check_notify; } /* we must process OoO packets before the first subflow is fully * established. OoO packets are instead a protocol violation * for MP_JOIN subflows as the peer must not send any data * before receiving the forth ack - cfr. RFC 8684 section 3.2. */ if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) { if (subflow->mp_join) goto reset; if (subflow->is_mptfo && mp_opt->suboptions & OPTION_MPTCP_MPC_ACK) goto set_fully_established; return subflow->mp_capable; } if (subflow->remote_key_valid && (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) || ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && (!mp_opt->echo || subflow->mp_join)))) { /* subflows are fully established as soon as we get any * additional ack, including ADD_ADDR. */ goto set_fully_established; } /* If the first established packet does not contain MP_CAPABLE + data * then fallback to TCP. Fallback scenarios requires a reset for * MP_JOIN subflows. */ if (!(mp_opt->suboptions & OPTIONS_MPTCP_MPC)) { if (subflow->mp_join) goto reset; subflow->mp_capable = 0; pr_fallback(msk); mptcp_do_fallback(ssk); return false; } if (mp_opt->deny_join_id0) WRITE_ONCE(msk->pm.remote_deny_join_id0, true); if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); set_fully_established: mptcp_data_lock((struct sock *)msk); __mptcp_subflow_fully_established(msk, subflow, mp_opt); mptcp_data_unlock((struct sock *)msk); check_notify: /* if the subflow is not already linked into the conn_list, we can't * notify the PM: this subflow is still on the listener queue * and the PM possibly acquiring the subflow lock could race with * the listener close */ if (likely(subflow->pm_notified) || list_empty(&subflow->node)) return true; subflow->pm_notified = 1; if (subflow->mp_join) { clear_3rdack_retransmission(ssk); mptcp_pm_subflow_established(msk); } else { mptcp_pm_fully_established(msk, ssk); } return true; reset: mptcp_subflow_reset(ssk); return false; } u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq) { u32 old_seq32, cur_seq32; old_seq32 = (u32)old_seq; cur_seq32 = (u32)cur_seq; cur_seq = (old_seq & GENMASK_ULL(63, 32)) + cur_seq32; if (unlikely(cur_seq32 < old_seq32 && before(old_seq32, cur_seq32))) return cur_seq + (1LL << 32); /* reverse wrap could happen, too */ if (unlikely(cur_seq32 > old_seq32 && after(old_seq32, cur_seq32))) return cur_seq - (1LL << 32); return cur_seq; } static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una) { msk->bytes_acked += new_snd_una - msk->snd_una; WRITE_ONCE(msk->snd_una, new_snd_una); } static void ack_update_msk(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_options_received *mp_opt) { u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt); struct sock *sk = (struct sock *)msk; u64 old_snd_una; mptcp_data_lock(sk); /* avoid ack expansion on update conflict, to reduce the risk of * wrongly expanding to a future ack sequence number, which is way * more dangerous than missing an ack */ old_snd_una = msk->snd_una; new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64); /* ACK for data not even sent yet? 
Ignore.*/ if (unlikely(after64(new_snd_una, snd_nxt))) new_snd_una = old_snd_una; new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; if (after64(new_wnd_end, msk->wnd_end)) WRITE_ONCE(msk->wnd_end, new_wnd_end); /* this assumes mptcp_incoming_options() is invoked after tcp_ack() */ if (after64(msk->wnd_end, snd_nxt)) __mptcp_check_push(sk, ssk); if (after64(new_snd_una, old_snd_una)) { __mptcp_snd_una_update(msk, new_snd_una); __mptcp_data_acked(sk); } msk->last_ack_recv = tcp_jiffies32; mptcp_data_unlock(sk); trace_ack_update_msk(mp_opt->data_ack, old_snd_una, new_snd_una, new_wnd_end, READ_ONCE(msk->wnd_end)); } bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit) { /* Skip if DATA_FIN was already received. * If updating simultaneously with the recvmsg loop, values * should match. If they mismatch, the peer is misbehaving and * we will prefer the most recent information. */ if (READ_ONCE(msk->rcv_data_fin)) return false; WRITE_ONCE(msk->rcv_data_fin_seq, mptcp_expand_seq(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit)); WRITE_ONCE(msk->rcv_data_fin, 1); return true; } static bool add_addr_hmac_valid(struct mptcp_sock *msk, struct mptcp_options_received *mp_opt) { u64 hmac = 0; if (mp_opt->echo) return true; hmac = add_addr_generate_hmac(READ_ONCE(msk->remote_key), READ_ONCE(msk->local_key), &mp_opt->addr); pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", msk, hmac, mp_opt->ahmac); return hmac == mp_opt->ahmac; } /* Return false if a subflow has been reset, else return true */ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_options_received mp_opt; struct mptcp_ext *mpext; if (__mptcp_check_fallback(msk)) { /* Keep it simple and unconditionally trigger send data cleanup and * pending queue spooling. We will need to acquire the data lock * for more accurate checks, and once the lock is acquired, such * helpers are cheap. */ mptcp_data_lock(subflow->conn); if (sk_stream_memory_free(sk)) __mptcp_check_push(subflow->conn, sk); /* on fallback we just need to ignore the msk-level snd_una, as * this is really plain TCP */ __mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt)); __mptcp_data_acked(subflow->conn); mptcp_data_unlock(subflow->conn); return true; } mptcp_get_options(skb, &mp_opt); /* The subflow can be in close state only if check_fully_established() * just sent a reset. If so, tell the caller to ignore the current packet. 
*/ if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return sk->sk_state != TCP_CLOSE; if (unlikely(mp_opt.suboptions != OPTION_MPTCP_DSS)) { if ((mp_opt.suboptions & OPTION_MPTCP_FASTCLOSE) && READ_ONCE(msk->local_key) == mp_opt.rcvr_key) { WRITE_ONCE(msk->rcv_fastclose, true); mptcp_schedule_work((struct sock *)msk); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSERX); } if ((mp_opt.suboptions & OPTION_MPTCP_ADD_ADDR) && add_addr_hmac_valid(msk, &mp_opt)) { if (!mp_opt.echo) { mptcp_pm_add_addr_received(sk, &mp_opt.addr); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR); } else { mptcp_pm_add_addr_echoed(msk, &mp_opt.addr); mptcp_pm_del_add_timer(msk, &mp_opt.addr, true); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD); } if (mp_opt.addr.port) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD); } if (mp_opt.suboptions & OPTION_MPTCP_RM_ADDR) mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list); if (mp_opt.suboptions & OPTION_MPTCP_PRIO) { mptcp_pm_mp_prio_received(sk, mp_opt.backup); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX); } if (mp_opt.suboptions & OPTION_MPTCP_FAIL) { mptcp_pm_mp_fail_received(sk, mp_opt.fail_seq); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILRX); } if (mp_opt.suboptions & OPTION_MPTCP_RST) { subflow->reset_seen = 1; subflow->reset_reason = mp_opt.reset_reason; subflow->reset_transient = mp_opt.reset_transient; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTRX); } if (!(mp_opt.suboptions & OPTION_MPTCP_DSS)) return true; } /* we can't wait for recvmsg() to update the ack_seq, otherwise * monodirectional flows will stuck */ if (mp_opt.use_ack) ack_update_msk(msk, sk, &mp_opt); /* Zero-data-length packets are dropped by the caller and not * propagated to the MPTCP layer, so the skb extension does not * need to be allocated or populated. DATA_FIN information, if * present, needs to be updated here before the skb is freed. 
*/ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { if (mp_opt.data_fin && mp_opt.data_len == 1 && mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64)) mptcp_schedule_work((struct sock *)msk); return true; } mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) return true; memset(mpext, 0, sizeof(*mpext)); if (likely(mp_opt.use_map)) { if (mp_opt.mpc_map) { /* this is an MP_CAPABLE carrying MPTCP data * we know this map the first chunk of data */ mptcp_crypto_key_sha(subflow->remote_key, NULL, &mpext->data_seq); mpext->data_seq++; mpext->subflow_seq = 1; mpext->dsn64 = 1; mpext->mpc_map = 1; mpext->data_fin = 0; } else { mpext->data_seq = mp_opt.data_seq; mpext->subflow_seq = mp_opt.subflow_seq; mpext->dsn64 = mp_opt.dsn64; mpext->data_fin = mp_opt.data_fin; } mpext->data_len = mp_opt.data_len; mpext->use_map = 1; mpext->csum_reqd = !!(mp_opt.suboptions & OPTION_MPTCP_CSUMREQD); if (mpext->csum_reqd) mpext->csum = mp_opt.csum; } return true; } static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) { const struct sock *ssk = (const struct sock *)tp; struct mptcp_subflow_context *subflow; u64 ack_seq, rcv_wnd_old, rcv_wnd_new; struct mptcp_sock *msk; u32 new_win; u64 win; subflow = mptcp_subflow_ctx(ssk); msk = mptcp_sk(subflow->conn); ack_seq = READ_ONCE(msk->ack_seq); rcv_wnd_new = ack_seq + tp->rcv_wnd; rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent); if (after64(rcv_wnd_new, rcv_wnd_old)) { u64 rcv_wnd; for (;;) { rcv_wnd = atomic64_cmpxchg(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new); if (rcv_wnd == rcv_wnd_old) break; rcv_wnd_old = rcv_wnd; if (before64(rcv_wnd_new, rcv_wnd_old)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE); goto raise_win; } MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT); } goto update_wspace; } if (rcv_wnd_new != rcv_wnd_old) { raise_win: win = rcv_wnd_old - ack_seq; tp->rcv_wnd = min_t(u64, win, U32_MAX); new_win = tp->rcv_wnd; /* Make sure we do not exceed the maximum possible * scaled window. */ if (unlikely(th->syn)) new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale; if (!tp->rx_opt.rcv_wscale && READ_ONCE(sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows)) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; th->window = htons(new_win); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED); } update_wspace: WRITE_ONCE(msk->old_wspace, tp->rcv_wnd); } __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) { struct csum_pseudo_header header; __wsum csum; /* cfr RFC 8684 3.3.1.: * the data sequence number used in the pseudo-header is * always the 64-bit value, irrespective of what length is used in the * DSS option itself. 
*/ header.data_seq = cpu_to_be64(data_seq); header.subflow_seq = htonl(subflow_seq); header.data_len = htons(data_len); header.csum = 0; csum = csum_partial(&header, sizeof(header), sum); return csum_fold(csum); } static __sum16 mptcp_make_csum(const struct mptcp_ext *mpext) { return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len, ~csum_unfold(mpext->csum)); } static void put_len_csum(u16 len, __sum16 csum, void *data) { __sum16 *sumptr = data + 2; __be16 *ptr = data; put_unaligned_be16(len, ptr); put_unaligned(csum, sumptr); } void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct mptcp_out_options *opts) { const struct sock *ssk = (const struct sock *)tp; struct mptcp_subflow_context *subflow; /* Which options can be used together? * * X: mutually exclusive * O: often used together * C: can be used together in some cases * P: could be used together but we prefer not to (optimisations) * * Opt: | MPC | MPJ | DSS | ADD | RM | PRIO | FAIL | FC | * ------|------|------|------|------|------|------|------|------| * MPC |------|------|------|------|------|------|------|------| * MPJ | X |------|------|------|------|------|------|------| * DSS | X | X |------|------|------|------|------|------| * ADD | X | X | P |------|------|------|------|------| * RM | C | C | C | P |------|------|------|------| * PRIO | X | C | C | C | C |------|------|------| * FAIL | X | X | C | X | X | X |------|------| * FC | X | X | X | X | X | X | X |------| * RST | X | X | X | X | X | X | O | O | * ------|------|------|------|------|------|------|------|------| * * The same applies in mptcp_established_options() function. */ if (likely(OPTION_MPTCP_DSS & opts->suboptions)) { struct mptcp_ext *mpext = &opts->ext_copy; u8 len = TCPOLEN_MPTCP_DSS_BASE; u8 flags = 0; if (mpext->use_ack) { flags = MPTCP_DSS_HAS_ACK; if (mpext->ack64) { len += TCPOLEN_MPTCP_DSS_ACK64; flags |= MPTCP_DSS_ACK64; } else { len += TCPOLEN_MPTCP_DSS_ACK32; } } if (mpext->use_map) { len += TCPOLEN_MPTCP_DSS_MAP64; /* Use only 64-bit mapping flags for now, add * support for optional 32-bit mappings later. */ flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; if (mpext->data_fin) flags |= MPTCP_DSS_DATA_FIN; if (opts->csum_reqd) len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); if (mpext->use_ack) { if (mpext->ack64) { put_unaligned_be64(mpext->data_ack, ptr); ptr += 2; } else { put_unaligned_be32(mpext->data_ack32, ptr); ptr += 1; } } if (mpext->use_map) { put_unaligned_be64(mpext->data_seq, ptr); ptr += 2; put_unaligned_be32(mpext->subflow_seq, ptr); ptr += 1; if (opts->csum_reqd) { /* data_len == 0 is reserved for the infinite mapping, * the checksum will also be set to 0. */ put_len_csum(mpext->data_len, (mpext->data_len ? 
mptcp_make_csum(mpext) : 0), ptr); } else { put_unaligned_be32(mpext->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } ptr += 1; } /* We might need to add MP_FAIL options in rare cases */ if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) goto mp_fail; } else if (OPTIONS_MPTCP_MPC & opts->suboptions) { u8 len, flag = MPTCP_CAP_HMAC_SHA256; if (OPTION_MPTCP_MPC_SYN & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYN; } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYNACK; } else if (opts->data_len) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) len += TCPOLEN_MPTCP_DSS_CHECKSUM; } else { len = TCPOLEN_MPTCP_MPC_ACK; } if (opts->csum_reqd) flag |= MPTCP_CAP_CHECKSUM_REQD; if (!opts->allow_join_id0) flag |= MPTCP_CAP_DENY_JOIN_ID0; *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, MPTCP_SUPPORTED_VERSION, flag); if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions)) goto mp_capable_done; put_unaligned_be64(opts->sndr_key, ptr); ptr += 2; if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions)) goto mp_capable_done; put_unaligned_be64(opts->rcvr_key, ptr); ptr += 2; if (!opts->data_len) goto mp_capable_done; if (opts->csum_reqd) { put_len_csum(opts->data_len, __mptcp_make_csum(opts->data_seq, opts->subflow_seq, opts->data_len, ~csum_unfold(opts->csum)), ptr); } else { put_unaligned_be32(opts->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } ptr += 1; /* MPC is additionally mutually exclusive with MP_PRIO */ goto mp_capable_done; } else if (OPTIONS_MPTCP_MPJ & opts->suboptions) { if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_SYN, opts->backup, opts->join_id); put_unaligned_be32(opts->token, ptr); ptr += 1; put_unaligned_be32(opts->nonce, ptr); ptr += 1; } else if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_SYNACK, opts->backup, opts->join_id); put_unaligned_be64(opts->thmac, ptr); ptr += 2; put_unaligned_be32(opts->nonce, ptr); ptr += 1; } else { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_ACK, 0, 0); memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); ptr += 5; } } else if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; u8 echo = MPTCP_ADDR_ECHO; #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (opts->addr.family == AF_INET6) len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; #endif if (opts->addr.port) len += TCPOLEN_MPTCP_PORT_LEN; if (opts->ahmac) { len += sizeof(opts->ahmac); echo = 0; } *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, len, echo, opts->addr.id); if (opts->addr.family == AF_INET) { memcpy((u8 *)ptr, (u8 *)&opts->addr.addr.s_addr, 4); ptr += 1; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opts->addr.family == AF_INET6) { memcpy((u8 *)ptr, opts->addr.addr6.s6_addr, 16); ptr += 4; } #endif if (!opts->addr.port) { if (opts->ahmac) { put_unaligned_be64(opts->ahmac, ptr); ptr += 2; } } else { u16 port = ntohs(opts->addr.port); if (opts->ahmac) { u8 *bptr = (u8 *)ptr; put_unaligned_be16(port, bptr); bptr += 2; put_unaligned_be64(opts->ahmac, bptr); bptr += 8; put_unaligned_be16(TCPOPT_NOP << 8 | TCPOPT_NOP, bptr); ptr += 3; } else { put_unaligned_be32(port << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); ptr += 1; } } } else if (unlikely(OPTION_MPTCP_FASTCLOSE & opts->suboptions)) { /* FASTCLOSE is mutually exclusive with others except RST */ *ptr++ = mptcp_option(MPTCPOPT_MP_FASTCLOSE, TCPOLEN_MPTCP_FASTCLOSE, 0, 0); put_unaligned_be64(opts->rcvr_key, ptr); ptr += 2; if (OPTION_MPTCP_RST & 
opts->suboptions) goto mp_rst; return; } else if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) { mp_fail: /* MP_FAIL is mutually exclusive with others except RST */ subflow = mptcp_subflow_ctx(ssk); subflow->send_mp_fail = 0; *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL, TCPOLEN_MPTCP_FAIL, 0, 0); put_unaligned_be64(opts->fail_seq, ptr); ptr += 2; if (OPTION_MPTCP_RST & opts->suboptions) goto mp_rst; return; } else if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) { mp_rst: *ptr++ = mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, opts->reset_transient, opts->reset_reason); return; } if (OPTION_MPTCP_PRIO & opts->suboptions) { subflow = mptcp_subflow_ctx(ssk); subflow->send_mp_prio = 0; *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO, TCPOLEN_MPTCP_PRIO, opts->backup, TCPOPT_NOP); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPPRIOTX); } mp_capable_done: if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { u8 i = 1; *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, TCPOLEN_MPTCP_RM_ADDR_BASE + opts->rm_list.nr, 0, opts->rm_list.ids[0]); while (i < opts->rm_list.nr) { u8 id1, id2, id3, id4; id1 = opts->rm_list.ids[i]; id2 = i + 1 < opts->rm_list.nr ? opts->rm_list.ids[i + 1] : TCPOPT_NOP; id3 = i + 2 < opts->rm_list.nr ? opts->rm_list.ids[i + 2] : TCPOPT_NOP; id4 = i + 3 < opts->rm_list.nr ? opts->rm_list.ids[i + 3] : TCPOPT_NOP; put_unaligned_be32(id1 << 24 | id2 << 16 | id3 << 8 | id4, ptr); ptr += 1; i += 4; } } if (tp) mptcp_set_rwin(tp, th); } __be32 mptcp_get_reset_option(const struct sk_buff *skb) { const struct mptcp_ext *ext = mptcp_get_ext(skb); u8 flags, reason; if (ext) { flags = ext->reset_transient; reason = ext->reset_reason; return mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, flags, reason); } return htonl(0u); } EXPORT_SYMBOL_GPL(mptcp_get_reset_option); |
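/* Editor's illustration (not part of the kernel sources above): a minimal
 * userspace sketch of the DSS checksum computed by __mptcp_make_csum(), i.e.
 * the standard Internet one's-complement checksum over the RFC 8684
 * pseudo-header (64-bit DSN, 32-bit subflow sequence number, 16-bit length,
 * zeroed checksum field) followed by the payload bytes. In the kernel the
 * payload contribution arrives pre-accumulated through the sum argument of
 * csum_partial(); here it is recomputed directly from the payload. The names
 * ones_sum(), ones_fold() and dss_csum() are invented for this sketch.
 */
#include <stdint.h>
#include <stddef.h>

static uint32_t ones_sum(const uint8_t *buf, size_t len, uint32_t sum)
{
	size_t i;

	/* Sum big-endian 16-bit words, padding an odd trailing byte with zero. */
	for (i = 0; i + 1 < len; i += 2)
		sum += ((uint32_t)buf[i] << 8) | buf[i + 1];
	if (len & 1)
		sum += (uint32_t)buf[len - 1] << 8;
	return sum;
}

static uint16_t ones_fold(uint32_t sum)
{
	/* Fold the carries back in and take the one's complement. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

static uint16_t dss_csum(uint64_t data_seq, uint32_t subflow_seq,
			 uint16_t data_len, const uint8_t *payload)
{
	uint8_t hdr[16] = { 0 };
	int i;

	/* Pseudo-header in network byte order; bytes 14-15 stay zero, they
	 * are the checksum field itself.
	 */
	for (i = 0; i < 8; i++)
		hdr[i] = (uint8_t)(data_seq >> (56 - 8 * i));
	for (i = 0; i < 4; i++)
		hdr[8 + i] = (uint8_t)(subflow_seq >> (24 - 8 * i));
	hdr[12] = (uint8_t)(data_len >> 8);
	hdr[13] = (uint8_t)data_len;

	/* One's-complement addition is associative, so summing the payload
	 * first and continuing over the pseudo-header mirrors the kernel's
	 * csum_partial(&header, sizeof(header), sum) followed by csum_fold().
	 */
	return ones_fold(ones_sum(hdr, sizeof(hdr),
				  ones_sum(payload, data_len, 0)));
}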
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET 802.1Q VLAN * Ethernet-type device handling. * * Authors: Ben Greear <greearb@candelatech.com> * Please send support related email to: netdev@vger.kernel.org * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: * Fix for packet capture - Nick Eggleston <nick@dccinc.com>; * Add HW acceleration hooks - David S.
Miller <davem@redhat.com>; * Correct all the locking - David S. Miller <davem@redhat.com>; * Use hash table for VLAN groups - David S. Miller <davem@redhat.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> #include <net/p8022.h> #include <net/arp.h> #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/uaccess.h> #include <linux/if_vlan.h> #include "vlan.h" #include "vlanproc.h" #define DRV_VERSION "1.8" /* Global VLAN variables */ unsigned int vlan_net_id __read_mostly; const char vlan_fullname[] = "802.1Q VLAN Support"; const char vlan_version[] = DRV_VERSION; /* End of global variables definitions. */ static int vlan_group_prealloc_vid(struct vlan_group *vg, __be16 vlan_proto, u16 vlan_id) { struct net_device **array; unsigned int vidx; unsigned int size; int pidx; ASSERT_RTNL(); pidx = vlan_proto_idx(vlan_proto); if (pidx < 0) return -EINVAL; vidx = vlan_id / VLAN_GROUP_ARRAY_PART_LEN; array = vg->vlan_devices_arrays[pidx][vidx]; if (array != NULL) return 0; size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; array = kzalloc(size, GFP_KERNEL_ACCOUNT); if (array == NULL) return -ENOBUFS; /* paired with smp_rmb() in __vlan_group_get_device() */ smp_wmb(); vg->vlan_devices_arrays[pidx][vidx] = array; return 0; } static void vlan_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev, struct vlan_dev_priv *vlan) { if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_stacked_transfer_operstate(rootdev, dev); } void unregister_vlan_dev(struct net_device *dev, struct list_head *head) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; struct vlan_info *vlan_info; struct vlan_group *grp; u16 vlan_id = vlan->vlan_id; ASSERT_RTNL(); vlan_info = rtnl_dereference(real_dev->vlan_info); BUG_ON(!vlan_info); grp = &vlan_info->grp; grp->nr_vlan_devs--; if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_leave(dev); if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_leave(dev); vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, NULL); netdev_upper_dev_unlink(real_dev, dev); /* Because unregister_netdevice_queue() makes sure at least one rcu * grace period is respected before device freeing, * we dont need to call synchronize_net() here. 
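 * Any reader still traversing the VLAN group under rcu_read_lock() is therefore guaranteed to have finished before the device memory can be freed.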
*/ unregister_netdevice_queue(dev, head); if (grp->nr_vlan_devs == 0) { vlan_mvrp_uninit_applicant(real_dev); vlan_gvrp_uninit_applicant(real_dev); } vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); } int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id, struct netlink_ext_ack *extack) { const char *name = real_dev->name; if (real_dev->features & NETIF_F_VLAN_CHALLENGED) { pr_info("VLANs not supported on %s\n", name); NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device"); return -EOPNOTSUPP; } if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) { NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists"); return -EEXIST; } return 0; } int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; u16 vlan_id = vlan->vlan_id; struct vlan_info *vlan_info; struct vlan_group *grp; int err; err = vlan_vid_add(real_dev, vlan->vlan_proto, vlan_id); if (err) return err; vlan_info = rtnl_dereference(real_dev->vlan_info); /* vlan_info should be there now. vlan_vid_add took care of it */ BUG_ON(!vlan_info); grp = &vlan_info->grp; if (grp->nr_vlan_devs == 0) { err = vlan_gvrp_init_applicant(real_dev); if (err < 0) goto out_vid_del; err = vlan_mvrp_init_applicant(real_dev); if (err < 0) goto out_uninit_gvrp; } err = vlan_group_prealloc_vid(grp, vlan->vlan_proto, vlan_id); if (err < 0) goto out_uninit_mvrp; err = register_netdevice(dev); if (err < 0) goto out_uninit_mvrp; err = netdev_upper_dev_link(real_dev, dev, extack); if (err) goto out_unregister_netdev; vlan_stacked_transfer_operstate(real_dev, dev, vlan); linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */ /* So, got the sucker initialized, now lets place * it into our local structure. */ vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev); grp->nr_vlan_devs++; return 0; out_unregister_netdev: unregister_netdevice(dev); out_uninit_mvrp: if (grp->nr_vlan_devs == 0) vlan_mvrp_uninit_applicant(real_dev); out_uninit_gvrp: if (grp->nr_vlan_devs == 0) vlan_gvrp_uninit_applicant(real_dev); out_vid_del: vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); return err; } /* Attach a VLAN device to a mac address (ie Ethernet Card). * Returns 0 if the device was created or a negative error code otherwise. */ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) { struct net_device *new_dev; struct vlan_dev_priv *vlan; struct net *net = dev_net(real_dev); struct vlan_net *vn = net_generic(net, vlan_net_id); char name[IFNAMSIZ]; int err; if (vlan_id >= VLAN_VID_MASK) return -ERANGE; err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id, NULL); if (err < 0) return err; /* Gotta set up the fields for the device. */ switch (vn->name_type) { case VLAN_NAME_TYPE_RAW_PLUS_VID: /* name will look like: eth1.0005 */ snprintf(name, IFNAMSIZ, "%s.%.4i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: vlan5 */ snprintf(name, IFNAMSIZ, "vlan%i", vlan_id); break; case VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: eth0.5 */ snprintf(name, IFNAMSIZ, "%s.%i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID: /* Put our vlan.VID in the name. 
* Name will look like: vlan0005 */ default: snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id); } new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name, NET_NAME_UNKNOWN, vlan_setup); if (new_dev == NULL) return -ENOBUFS; dev_net_set(new_dev, net); /* need 4 bytes for extra VLAN header info, * hope the underlying device can handle it. */ new_dev->mtu = real_dev->mtu; vlan = vlan_dev_priv(new_dev); vlan->vlan_proto = htons(ETH_P_8021Q); vlan->vlan_id = vlan_id; vlan->real_dev = real_dev; vlan->dent = NULL; vlan->flags = VLAN_FLAG_REORDER_HDR; new_dev->rtnl_link_ops = &vlan_link_ops; err = register_vlan_dev(new_dev, NULL); if (err < 0) goto out_free_newdev; return 0; out_free_newdev: free_netdev(new_dev); return err; } static void vlan_sync_address(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); /* May be called without an actual change */ if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) return; /* vlan continues to inherit address of lower device */ if (vlan_dev_inherit_address(vlandev, dev)) goto out; /* vlan address was different from the old address and is equal to * the new address */ if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_del(dev, vlandev->dev_addr); /* vlan address was equal to the old address and is different from * the new address */ if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_add(dev, vlandev->dev_addr); out: ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } static void vlan_transfer_features(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); netif_inherit_tso_max(vlandev, dev); if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto)) vlandev->hard_header_len = dev->hard_header_len; else vlandev->hard_header_len = dev->hard_header_len + VLAN_HLEN; #if IS_ENABLED(CONFIG_FCOE) vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid; #endif vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE; vlandev->priv_flags |= (vlan->real_dev->priv_flags & IFF_XMIT_DST_RELEASE); vlandev->hw_enc_features = vlan_tnl_features(vlan->real_dev); netdev_update_features(vlandev); } static int __vlan_device_event(struct net_device *dev, unsigned long event) { int err = 0; switch (event) { case NETDEV_CHANGENAME: vlan_proc_rem_dev(dev); err = vlan_proc_add_dev(dev); break; case NETDEV_REGISTER: err = vlan_proc_add_dev(dev); break; case NETDEV_UNREGISTER: vlan_proc_rem_dev(dev); break; } return err; } static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vlan_group *grp; struct vlan_info *vlan_info; int i, flgs; struct net_device *vlandev; struct vlan_dev_priv *vlan; bool last = false; LIST_HEAD(list); int err; if (is_vlan_dev(dev)) { int err = __vlan_device_event(dev, event); if (err) return notifier_from_errno(err); } if ((event == NETDEV_UP) && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) { pr_info("adding VLAN 0 to HW filter on device %s\n", dev->name); vlan_vid_add(dev, htons(ETH_P_8021Q), 0); } if (event == NETDEV_DOWN && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) vlan_vid_del(dev, htons(ETH_P_8021Q), 0); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) goto out; grp = &vlan_info->grp; /* It is OK that we do not hold the group lock right 
now, * as we run under the RTNL lock. */ switch (event) { case NETDEV_CHANGE: /* Propagate real device state to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); break; case NETDEV_CHANGEADDR: /* Adjust unicast filters on underlying device */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan_sync_address(dev, vlandev); } break; case NETDEV_CHANGEMTU: vlan_group_for_each_dev(grp, i, vlandev) { if (vlandev->mtu <= dev->mtu) continue; dev_set_mtu(vlandev, dev->mtu); } break; case NETDEV_FEAT_CHANGE: /* Propagate device features to underlying device */ vlan_group_for_each_dev(grp, i, vlandev) vlan_transfer_features(dev, vlandev); break; case NETDEV_DOWN: { struct net_device *tmp; LIST_HEAD(close_list); /* Put all VLANs for this dev in the down state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) list_add(&vlandev->close_list, &close_list); } dev_close_many(&close_list, false); list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) { vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); list_del_init(&vlandev->close_list); } list_del(&close_list); break; } case NETDEV_UP: /* Put all VLANs for this dev in the up state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = dev_get_flags(vlandev); if (flgs & IFF_UP) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) dev_change_flags(vlandev, flgs | IFF_UP, extack); vlan_stacked_transfer_operstate(dev, vlandev, vlan); } break; case NETDEV_UNREGISTER: /* twiddle thumbs on netns device moves */ if (dev->reg_state != NETREG_UNREGISTERING) break; vlan_group_for_each_dev(grp, i, vlandev) { /* removal of last vid destroys vlan_info, abort * afterwards */ if (vlan_info->nr_vids == 1) last = true; unregister_vlan_dev(vlandev, &list); if (last) break; } unregister_netdevice_many(&list); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlaying device to change its type. */ if (vlan_uses_dev(dev)) return NOTIFY_BAD; break; case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: case NETDEV_RESEND_IGMP: /* Propagate to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) call_netdevice_notifiers(event, vlandev); break; case NETDEV_CVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021Q)); if (err) return notifier_from_errno(err); break; case NETDEV_CVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021Q)); break; case NETDEV_SVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021AD)); if (err) return notifier_from_errno(err); break; case NETDEV_SVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021AD)); break; } out: return NOTIFY_DONE; } static struct notifier_block vlan_notifier_block __read_mostly = { .notifier_call = vlan_device_event, }; /* * VLAN IOCTL handler. * o execute requested action or pass command to the device driver * arg is really a struct vlan_ioctl_args __user *. */ static int vlan_ioctl_handler(struct net *net, void __user *arg) { int err; struct vlan_ioctl_args args; struct net_device *dev = NULL; if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args))) return -EFAULT; /* Null terminate this sucker, just in case. 
*/ args.device1[sizeof(args.device1) - 1] = 0; args.u.device2[sizeof(args.u.device2) - 1] = 0; rtnl_lock(); switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: case SET_VLAN_EGRESS_PRIORITY_CMD: case SET_VLAN_FLAG_CMD: case ADD_VLAN_CMD: case DEL_VLAN_CMD: case GET_VLAN_REALDEV_NAME_CMD: case GET_VLAN_VID_CMD: err = -ENODEV; dev = __dev_get_by_name(net, args.device1); if (!dev) goto out; err = -EINVAL; if (args.cmd != ADD_VLAN_CMD && !is_vlan_dev(dev)) goto out; } switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; vlan_dev_set_ingress_priority(dev, args.u.skb_priority, args.vlan_qos); err = 0; break; case SET_VLAN_EGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_set_egress_priority(dev, args.u.skb_priority, args.vlan_qos); break; case SET_VLAN_FLAG_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_change_flags(dev, args.vlan_qos ? args.u.flag : 0, args.u.flag); break; case SET_VLAN_NAME_TYPE_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (args.u.name_type < VLAN_NAME_TYPE_HIGHEST) { struct vlan_net *vn; vn = net_generic(net, vlan_net_id); vn->name_type = args.u.name_type; err = 0; } else { err = -EINVAL; } break; case ADD_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = register_vlan_device(dev, args.u.VID); break; case DEL_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; unregister_vlan_dev(dev, NULL); err = 0; break; case GET_VLAN_REALDEV_NAME_CMD: err = 0; vlan_dev_get_realdev_name(dev, args.u.device2, sizeof(args.u.device2)); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; case GET_VLAN_VID_CMD: err = 0; args.u.VID = vlan_dev_vlan_id(dev); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; default: err = -EOPNOTSUPP; break; } out: rtnl_unlock(); return err; } static int __net_init vlan_init_net(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); int err; vn->name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD; err = vlan_proc_init(net); return err; } static void __net_exit vlan_exit_net(struct net *net) { vlan_proc_cleanup(net); } static struct pernet_operations vlan_net_ops = { .init = vlan_init_net, .exit = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct vlan_net), }; static int __init vlan_proto_init(void) { int err; pr_info("%s v%s\n", vlan_fullname, vlan_version); err = register_pernet_subsys(&vlan_net_ops); if (err < 0) goto err0; err = register_netdevice_notifier(&vlan_notifier_block); if (err < 0) goto err2; err = vlan_gvrp_init(); if (err < 0) goto err3; err = vlan_mvrp_init(); if (err < 0) goto err4; err = vlan_netlink_init(); if (err < 0) goto err5; vlan_ioctl_set(vlan_ioctl_handler); return 0; err5: vlan_mvrp_uninit(); err4: vlan_gvrp_uninit(); err3: unregister_netdevice_notifier(&vlan_notifier_block); err2: unregister_pernet_subsys(&vlan_net_ops); err0: return err; } static void __exit vlan_cleanup_module(void) { vlan_ioctl_set(NULL); vlan_netlink_fini(); unregister_netdevice_notifier(&vlan_notifier_block); unregister_pernet_subsys(&vlan_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ vlan_mvrp_uninit(); vlan_gvrp_uninit(); } module_init(vlan_proto_init); module_exit(vlan_cleanup_module); MODULE_DESCRIPTION("802.1Q/802.1ad VLAN Protocol"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); |
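/* Editor's illustration (not part of the kernel sources above): the legacy
 * ioctl interface served by vlan_ioctl_handler() above is the path
 * historically used by vconfig-style tools. A minimal userspace sketch of
 * ADD_VLAN_CMD, which ends up in register_vlan_device(); with the default
 * name type set in vlan_init_net() (VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD) this
 * creates "eth0.5". The interface name and VID are example values.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/if_vlan.h>	/* struct vlan_ioctl_args, ADD_VLAN_CMD */
#include <linux/sockios.h>	/* SIOCSIFVLAN */

int main(void)
{
	struct vlan_ioctl_args args;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&args, 0, sizeof(args));
	args.cmd = ADD_VLAN_CMD;
	strncpy(args.device1, "eth0", sizeof(args.device1) - 1);
	args.u.VID = 5;

	/* Requires CAP_NET_ADMIN; fails with EEXIST if eth0.5 already exists
	 * and with EOPNOTSUPP if eth0 is NETIF_F_VLAN_CHALLENGED.
	 */
	if (ioctl(fd, SIOCSIFVLAN, &args) < 0)
		perror("SIOCSIFVLAN");

	close(fd);
	return 0;
}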
| 9 9 3 2 2 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 | // SPDX-License-Identifier: GPL-2.0-only /* * STP SAP demux * * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/mutex.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> #include <linux/llc.h> #include <linux/slab.h> #include <linux/module.h> #include <net/llc.h> #include <net/llc_pdu.h> #include <net/stp.h> /* 01:80:c2:00:00:20 - 01:80:c2:00:00:2F */ #define GARP_ADDR_MIN 0x20 #define GARP_ADDR_MAX 0x2F #define GARP_ADDR_RANGE (GARP_ADDR_MAX - GARP_ADDR_MIN) static const struct stp_proto __rcu *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly; static const struct stp_proto __rcu *stp_proto __read_mostly; static struct llc_sap *sap __read_mostly; static unsigned int sap_registered; static DEFINE_MUTEX(stp_proto_mutex); /* Called under rcu_read_lock from LLC */ static int stp_pdu_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { const struct ethhdr *eh = eth_hdr(skb); const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); const struct stp_proto *proto; if (pdu->ssap != LLC_SAP_BSPAN || pdu->dsap != LLC_SAP_BSPAN || pdu->ctrl_1 != LLC_PDU_TYPE_U) goto err; if (eh->h_dest[5] >= GARP_ADDR_MIN && eh->h_dest[5] <= GARP_ADDR_MAX) { proto = rcu_dereference(garp_protos[eh->h_dest[5] - GARP_ADDR_MIN]); if (proto && !ether_addr_equal(eh->h_dest, proto->group_address)) goto err; } else proto = rcu_dereference(stp_proto); if (!proto) goto err; proto->rcv(proto, skb, dev); return 0; err: kfree_skb(skb); return 0; } int stp_proto_register(const struct stp_proto *proto) { int err = 0; mutex_lock(&stp_proto_mutex); if (sap_registered++ == 0) { sap = llc_sap_open(LLC_SAP_BSPAN, stp_pdu_rcv); if (!sap) { err = -ENOMEM; goto out; } } if (is_zero_ether_addr(proto->group_address)) rcu_assign_pointer(stp_proto, proto); else rcu_assign_pointer(garp_protos[proto->group_address[5] - GARP_ADDR_MIN], proto); out: mutex_unlock(&stp_proto_mutex); return err; } EXPORT_SYMBOL_GPL(stp_proto_register); void stp_proto_unregister(const struct stp_proto *proto) { mutex_lock(&stp_proto_mutex); if (is_zero_ether_addr(proto->group_address)) RCU_INIT_POINTER(stp_proto, NULL); else RCU_INIT_POINTER(garp_protos[proto->group_address[5] - GARP_ADDR_MIN], NULL); synchronize_rcu(); if (--sap_registered == 0) llc_sap_put(sap); mutex_unlock(&stp_proto_mutex); } EXPORT_SYMBOL_GPL(stp_proto_unregister); MODULE_DESCRIPTION("SAP demux for IEEE 802.1D Spanning Tree Protocol (STP)"); MODULE_LICENSE("GPL"); |
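/* Editor's illustration (not part of the kernel sources above): a minimal
 * sketch of how a consumer registers with the SAP demux above. An all-zero
 * group_address selects the plain stp_proto slot, so stp_pdu_rcv() hands it
 * every BSPAN PDU that is not addressed to one of the GARP group addresses;
 * initialising group_address to 01:80:c2:00:00:2x would land the handler in
 * garp_protos[] instead. The function and variable names are invented, and
 * the handler here simply drops the frame.
 */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/stp.h>

static void example_stp_rcv(const struct stp_proto *proto,
			    struct sk_buff *skb, struct net_device *dev)
{
	/* A real implementation would parse the BPDU here. */
	kfree_skb(skb);
}

static const struct stp_proto example_stp_proto = {
	/* group_address left all-zero: classic STP, not a GARP application */
	.rcv	= example_stp_rcv,
};

static int __init example_stp_init(void)
{
	return stp_proto_register(&example_stp_proto);
}

static void __exit example_stp_exit(void)
{
	stp_proto_unregister(&example_stp_proto);
}

module_init(example_stp_init);
module_exit(example_stp_exit);
MODULE_DESCRIPTION("Example STP SAP consumer (illustrative only)");
MODULE_LICENSE("GPL");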
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/ #include "netlink.h" #include "device.h" #include "peer.h" #include "socket.h" #include "queueing.h" #include "messages.h" #include <uapi/linux/wireguard.h> #include <linux/if.h> #include <net/genetlink.h> #include <net/sock.h> #include <crypto/utils.h> static struct genl_family genl_family; static const struct nla_policy device_policy[WGDEVICE_A_MAX + 1] = { [WGDEVICE_A_IFINDEX] = { .type = NLA_U32 }, [WGDEVICE_A_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [WGDEVICE_A_PRIVATE_KEY] = NLA_POLICY_EXACT_LEN(NOISE_PUBLIC_KEY_LEN), [WGDEVICE_A_PUBLIC_KEY] = NLA_POLICY_EXACT_LEN(NOISE_PUBLIC_KEY_LEN), [WGDEVICE_A_FLAGS] = { .type = NLA_U32 }, [WGDEVICE_A_LISTEN_PORT] = { .type = NLA_U16 }, [WGDEVICE_A_FWMARK] = { .type = NLA_U32 }, [WGDEVICE_A_PEERS] = { .type = NLA_NESTED } }; static const struct nla_policy peer_policy[WGPEER_A_MAX + 1] = { [WGPEER_A_PUBLIC_KEY] = NLA_POLICY_EXACT_LEN(NOISE_PUBLIC_KEY_LEN), [WGPEER_A_PRESHARED_KEY] = NLA_POLICY_EXACT_LEN(NOISE_SYMMETRIC_KEY_LEN), [WGPEER_A_FLAGS] = { .type = NLA_U32 }, [WGPEER_A_ENDPOINT] = NLA_POLICY_MIN_LEN(sizeof(struct sockaddr)), [WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL] = { .type = NLA_U16 }, [WGPEER_A_LAST_HANDSHAKE_TIME] = NLA_POLICY_EXACT_LEN(sizeof(struct __kernel_timespec)), [WGPEER_A_RX_BYTES] = { .type = NLA_U64 }, [WGPEER_A_TX_BYTES] = { .type = NLA_U64 }, [WGPEER_A_ALLOWEDIPS] = { .type = NLA_NESTED }, [WGPEER_A_PROTOCOL_VERSION] = { .type = NLA_U32 } }; static const struct nla_policy allowedip_policy[WGALLOWEDIP_A_MAX + 1] = { [WGALLOWEDIP_A_FAMILY] = { .type = NLA_U16 }, [WGALLOWEDIP_A_IPADDR] = NLA_POLICY_MIN_LEN(sizeof(struct in_addr)), [WGALLOWEDIP_A_CIDR_MASK] = { .type = NLA_U8 } }; static struct wg_device *lookup_interface(struct nlattr **attrs, struct sk_buff *skb) { struct net_device *dev = NULL; if (!attrs[WGDEVICE_A_IFINDEX] == !attrs[WGDEVICE_A_IFNAME]) return ERR_PTR(-EBADR); if (attrs[WGDEVICE_A_IFINDEX]) dev = dev_get_by_index(sock_net(skb->sk), nla_get_u32(attrs[WGDEVICE_A_IFINDEX])); else if (attrs[WGDEVICE_A_IFNAME]) dev = dev_get_by_name(sock_net(skb->sk), nla_data(attrs[WGDEVICE_A_IFNAME])); if (!dev) return ERR_PTR(-ENODEV); if (!dev->rtnl_link_ops || !dev->rtnl_link_ops->kind || strcmp(dev->rtnl_link_ops->kind, KBUILD_MODNAME)) { dev_put(dev); return ERR_PTR(-EOPNOTSUPP); } return netdev_priv(dev); } static int get_allowedips(struct sk_buff *skb, const u8 *ip, u8 cidr, int family) { struct nlattr *allowedip_nest; allowedip_nest = nla_nest_start(skb, 0); if (!allowedip_nest) return -EMSGSIZE; if (nla_put_u8(skb, WGALLOWEDIP_A_CIDR_MASK, cidr) || nla_put_u16(skb, WGALLOWEDIP_A_FAMILY, family) || nla_put(skb, WGALLOWEDIP_A_IPADDR, family == AF_INET6 ? 
sizeof(struct in6_addr) : sizeof(struct in_addr), ip)) { nla_nest_cancel(skb, allowedip_nest); return -EMSGSIZE; } nla_nest_end(skb, allowedip_nest); return 0; } struct dump_ctx { struct wg_device *wg; struct wg_peer *next_peer; u64 allowedips_seq; struct allowedips_node *next_allowedip; }; #define DUMP_CTX(cb) ((struct dump_ctx *)(cb)->args) static int get_peer(struct wg_peer *peer, struct sk_buff *skb, struct dump_ctx *ctx) { struct nlattr *allowedips_nest, *peer_nest = nla_nest_start(skb, 0); struct allowedips_node *allowedips_node = ctx->next_allowedip; bool fail; if (!peer_nest) return -EMSGSIZE; down_read(&peer->handshake.lock); fail = nla_put(skb, WGPEER_A_PUBLIC_KEY, NOISE_PUBLIC_KEY_LEN, peer->handshake.remote_static); up_read(&peer->handshake.lock); if (fail) goto err; if (!allowedips_node) { const struct __kernel_timespec last_handshake = { .tv_sec = peer->walltime_last_handshake.tv_sec, .tv_nsec = peer->walltime_last_handshake.tv_nsec }; down_read(&peer->handshake.lock); fail = nla_put(skb, WGPEER_A_PRESHARED_KEY, NOISE_SYMMETRIC_KEY_LEN, peer->handshake.preshared_key); up_read(&peer->handshake.lock); if (fail) goto err; if (nla_put(skb, WGPEER_A_LAST_HANDSHAKE_TIME, sizeof(last_handshake), &last_handshake) || nla_put_u16(skb, WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL, peer->persistent_keepalive_interval) || nla_put_u64_64bit(skb, WGPEER_A_TX_BYTES, peer->tx_bytes, WGPEER_A_UNSPEC) || nla_put_u64_64bit(skb, WGPEER_A_RX_BYTES, peer->rx_bytes, WGPEER_A_UNSPEC) || nla_put_u32(skb, WGPEER_A_PROTOCOL_VERSION, 1)) goto err; read_lock_bh(&peer->endpoint_lock); if (peer->endpoint.addr.sa_family == AF_INET) fail = nla_put(skb, WGPEER_A_ENDPOINT, sizeof(peer->endpoint.addr4), &peer->endpoint.addr4); else if (peer->endpoint.addr.sa_family == AF_INET6) fail = nla_put(skb, WGPEER_A_ENDPOINT, sizeof(peer->endpoint.addr6), &peer->endpoint.addr6); read_unlock_bh(&peer->endpoint_lock); if (fail) goto err; allowedips_node = list_first_entry_or_null(&peer->allowedips_list, struct allowedips_node, peer_list); } if (!allowedips_node) goto no_allowedips; if (!ctx->allowedips_seq) ctx->allowedips_seq = ctx->wg->peer_allowedips.seq; else if (ctx->allowedips_seq != ctx->wg->peer_allowedips.seq) goto no_allowedips; allowedips_nest = nla_nest_start(skb, WGPEER_A_ALLOWEDIPS); if (!allowedips_nest) goto err; list_for_each_entry_from(allowedips_node, &peer->allowedips_list, peer_list) { u8 cidr, ip[16] __aligned(__alignof(u64)); int family; family = wg_allowedips_read_node(allowedips_node, ip, &cidr); if (get_allowedips(skb, ip, cidr, family)) { nla_nest_end(skb, allowedips_nest); nla_nest_end(skb, peer_nest); ctx->next_allowedip = allowedips_node; return -EMSGSIZE; } } nla_nest_end(skb, allowedips_nest); no_allowedips: nla_nest_end(skb, peer_nest); ctx->next_allowedip = NULL; ctx->allowedips_seq = 0; return 0; err: nla_nest_cancel(skb, peer_nest); return -EMSGSIZE; } static int wg_get_device_start(struct netlink_callback *cb) { struct wg_device *wg; wg = lookup_interface(genl_info_dump(cb)->attrs, cb->skb); if (IS_ERR(wg)) return PTR_ERR(wg); DUMP_CTX(cb)->wg = wg; return 0; } static int wg_get_device_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct wg_peer *peer, *next_peer_cursor; struct dump_ctx *ctx = DUMP_CTX(cb); struct wg_device *wg = ctx->wg; struct nlattr *peers_nest; int ret = -EMSGSIZE; bool done = true; void *hdr; rtnl_lock(); mutex_lock(&wg->device_update_lock); cb->seq = wg->device_update_gen; next_peer_cursor = ctx->next_peer; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, 
cb->nlh->nlmsg_seq, &genl_family, NLM_F_MULTI, WG_CMD_GET_DEVICE); if (!hdr) goto out; genl_dump_check_consistent(cb, hdr); if (!ctx->next_peer) { if (nla_put_u16(skb, WGDEVICE_A_LISTEN_PORT, wg->incoming_port) || nla_put_u32(skb, WGDEVICE_A_FWMARK, wg->fwmark) || nla_put_u32(skb, WGDEVICE_A_IFINDEX, wg->dev->ifindex) || nla_put_string(skb, WGDEVICE_A_IFNAME, wg->dev->name)) goto out; down_read(&wg->static_identity.lock); if (wg->static_identity.has_identity) { if (nla_put(skb, WGDEVICE_A_PRIVATE_KEY, NOISE_PUBLIC_KEY_LEN, wg->static_identity.static_private) || nla_put(skb, WGDEVICE_A_PUBLIC_KEY, NOISE_PUBLIC_KEY_LEN, wg->static_identity.static_public)) { up_read(&wg->static_identity.lock); goto out; } } up_read(&wg->static_identity.lock); } peers_nest = nla_nest_start(skb, WGDEVICE_A_PEERS); if (!peers_nest) goto out; ret = 0; lockdep_assert_held(&wg->device_update_lock); /* If the last cursor was removed in peer_remove or peer_remove_all, then * we just treat this the same as there being no more peers left. The * reason is that seq_nr should indicate to userspace that this isn't a * coherent dump anyway, so they'll try again. */ if (list_empty(&wg->peer_list) || (ctx->next_peer && ctx->next_peer->is_dead)) { nla_nest_cancel(skb, peers_nest); goto out; } peer = list_prepare_entry(ctx->next_peer, &wg->peer_list, peer_list); list_for_each_entry_continue(peer, &wg->peer_list, peer_list) { if (get_peer(peer, skb, ctx)) { done = false; break; } next_peer_cursor = peer; } nla_nest_end(skb, peers_nest); out: if (!ret && !done && next_peer_cursor) wg_peer_get(next_peer_cursor); wg_peer_put(ctx->next_peer); mutex_unlock(&wg->device_update_lock); rtnl_unlock(); if (ret) { genlmsg_cancel(skb, hdr); return ret; } genlmsg_end(skb, hdr); if (done) { ctx->next_peer = NULL; return 0; } ctx->next_peer = next_peer_cursor; return skb->len; /* At this point, we can't really deal ourselves with safely zeroing out * the private key material after usage. This will need an additional API * in the kernel for marking skbs as zero_on_free. 
*/ } static int wg_get_device_done(struct netlink_callback *cb) { struct dump_ctx *ctx = DUMP_CTX(cb); if (ctx->wg) dev_put(ctx->wg->dev); wg_peer_put(ctx->next_peer); return 0; } static int set_port(struct wg_device *wg, u16 port) { struct wg_peer *peer; if (wg->incoming_port == port) return 0; list_for_each_entry(peer, &wg->peer_list, peer_list) wg_socket_clear_peer_endpoint_src(peer); if (!netif_running(wg->dev)) { wg->incoming_port = port; return 0; } return wg_socket_init(wg, port); } static int set_allowedip(struct wg_peer *peer, struct nlattr **attrs) { int ret = -EINVAL; u16 family; u8 cidr; if (!attrs[WGALLOWEDIP_A_FAMILY] || !attrs[WGALLOWEDIP_A_IPADDR] || !attrs[WGALLOWEDIP_A_CIDR_MASK]) return ret; family = nla_get_u16(attrs[WGALLOWEDIP_A_FAMILY]); cidr = nla_get_u8(attrs[WGALLOWEDIP_A_CIDR_MASK]); if (family == AF_INET && cidr <= 32 && nla_len(attrs[WGALLOWEDIP_A_IPADDR]) == sizeof(struct in_addr)) ret = wg_allowedips_insert_v4( &peer->device->peer_allowedips, nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, peer, &peer->device->device_update_lock); else if (family == AF_INET6 && cidr <= 128 && nla_len(attrs[WGALLOWEDIP_A_IPADDR]) == sizeof(struct in6_addr)) ret = wg_allowedips_insert_v6( &peer->device->peer_allowedips, nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, peer, &peer->device->device_update_lock); return ret; } static int set_peer(struct wg_device *wg, struct nlattr **attrs) { u8 *public_key = NULL, *preshared_key = NULL; struct wg_peer *peer = NULL; u32 flags = 0; int ret; ret = -EINVAL; if (attrs[WGPEER_A_PUBLIC_KEY] && nla_len(attrs[WGPEER_A_PUBLIC_KEY]) == NOISE_PUBLIC_KEY_LEN) public_key = nla_data(attrs[WGPEER_A_PUBLIC_KEY]); else goto out; if (attrs[WGPEER_A_PRESHARED_KEY] && nla_len(attrs[WGPEER_A_PRESHARED_KEY]) == NOISE_SYMMETRIC_KEY_LEN) preshared_key = nla_data(attrs[WGPEER_A_PRESHARED_KEY]); if (attrs[WGPEER_A_FLAGS]) flags = nla_get_u32(attrs[WGPEER_A_FLAGS]); ret = -EOPNOTSUPP; if (flags & ~__WGPEER_F_ALL) goto out; ret = -EPFNOSUPPORT; if (attrs[WGPEER_A_PROTOCOL_VERSION]) { if (nla_get_u32(attrs[WGPEER_A_PROTOCOL_VERSION]) != 1) goto out; } peer = wg_pubkey_hashtable_lookup(wg->peer_hashtable, nla_data(attrs[WGPEER_A_PUBLIC_KEY])); ret = 0; if (!peer) { /* Peer doesn't exist yet. Add a new one. */ if (flags & (WGPEER_F_REMOVE_ME | WGPEER_F_UPDATE_ONLY)) goto out; /* The peer is new, so there aren't allowed IPs to remove. */ flags &= ~WGPEER_F_REPLACE_ALLOWEDIPS; down_read(&wg->static_identity.lock); if (wg->static_identity.has_identity && !memcmp(nla_data(attrs[WGPEER_A_PUBLIC_KEY]), wg->static_identity.static_public, NOISE_PUBLIC_KEY_LEN)) { /* We silently ignore peers that have the same public * key as the device. The reason we do it silently is * that we'd like for people to be able to reuse the * same set of API calls across peers. */ up_read(&wg->static_identity.lock); ret = 0; goto out; } up_read(&wg->static_identity.lock); peer = wg_peer_create(wg, public_key, preshared_key); if (IS_ERR(peer)) { ret = PTR_ERR(peer); peer = NULL; goto out; } /* Take additional reference, as though we've just been * looked up. 
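 * The extra reference makes the common wg_peer_put() at the out label balance out the same way for a freshly created peer as for one returned by the hashtable lookup.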
*/ wg_peer_get(peer); } if (flags & WGPEER_F_REMOVE_ME) { wg_peer_remove(peer); goto out; } if (preshared_key) { down_write(&peer->handshake.lock); memcpy(&peer->handshake.preshared_key, preshared_key, NOISE_SYMMETRIC_KEY_LEN); up_write(&peer->handshake.lock); } if (attrs[WGPEER_A_ENDPOINT]) { struct sockaddr *addr = nla_data(attrs[WGPEER_A_ENDPOINT]); size_t len = nla_len(attrs[WGPEER_A_ENDPOINT]); struct endpoint endpoint = { { { 0 } } }; if (len == sizeof(struct sockaddr_in) && addr->sa_family == AF_INET) { endpoint.addr4 = *(struct sockaddr_in *)addr; wg_socket_set_peer_endpoint(peer, &endpoint); } else if (len == sizeof(struct sockaddr_in6) && addr->sa_family == AF_INET6) { endpoint.addr6 = *(struct sockaddr_in6 *)addr; wg_socket_set_peer_endpoint(peer, &endpoint); } } if (flags & WGPEER_F_REPLACE_ALLOWEDIPS) wg_allowedips_remove_by_peer(&wg->peer_allowedips, peer, &wg->device_update_lock); if (attrs[WGPEER_A_ALLOWEDIPS]) { struct nlattr *attr, *allowedip[WGALLOWEDIP_A_MAX + 1]; int rem; nla_for_each_nested(attr, attrs[WGPEER_A_ALLOWEDIPS], rem) { ret = nla_parse_nested(allowedip, WGALLOWEDIP_A_MAX, attr, allowedip_policy, NULL); if (ret < 0) goto out; ret = set_allowedip(peer, allowedip); if (ret < 0) goto out; } } if (attrs[WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL]) { const u16 persistent_keepalive_interval = nla_get_u16( attrs[WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL]); const bool send_keepalive = !peer->persistent_keepalive_interval && persistent_keepalive_interval && netif_running(wg->dev); peer->persistent_keepalive_interval = persistent_keepalive_interval; if (send_keepalive) wg_packet_send_keepalive(peer); } if (netif_running(wg->dev)) wg_packet_send_staged_packets(peer); out: wg_peer_put(peer); if (attrs[WGPEER_A_PRESHARED_KEY]) memzero_explicit(nla_data(attrs[WGPEER_A_PRESHARED_KEY]), nla_len(attrs[WGPEER_A_PRESHARED_KEY])); return ret; } static int wg_set_device(struct sk_buff *skb, struct genl_info *info) { struct wg_device *wg = lookup_interface(info->attrs, skb); u32 flags = 0; int ret; if (IS_ERR(wg)) { ret = PTR_ERR(wg); goto out_nodev; } rtnl_lock(); mutex_lock(&wg->device_update_lock); if (info->attrs[WGDEVICE_A_FLAGS]) flags = nla_get_u32(info->attrs[WGDEVICE_A_FLAGS]); ret = -EOPNOTSUPP; if (flags & ~__WGDEVICE_F_ALL) goto out; if (info->attrs[WGDEVICE_A_LISTEN_PORT] || info->attrs[WGDEVICE_A_FWMARK]) { struct net *net; rcu_read_lock(); net = rcu_dereference(wg->creating_net); ret = !net || !ns_capable(net->user_ns, CAP_NET_ADMIN) ? -EPERM : 0; rcu_read_unlock(); if (ret) goto out; } ++wg->device_update_gen; if (info->attrs[WGDEVICE_A_FWMARK]) { struct wg_peer *peer; wg->fwmark = nla_get_u32(info->attrs[WGDEVICE_A_FWMARK]); list_for_each_entry(peer, &wg->peer_list, peer_list) wg_socket_clear_peer_endpoint_src(peer); } if (info->attrs[WGDEVICE_A_LISTEN_PORT]) { ret = set_port(wg, nla_get_u16(info->attrs[WGDEVICE_A_LISTEN_PORT])); if (ret) goto out; } if (flags & WGDEVICE_F_REPLACE_PEERS) wg_peer_remove_all(wg); if (info->attrs[WGDEVICE_A_PRIVATE_KEY] && nla_len(info->attrs[WGDEVICE_A_PRIVATE_KEY]) == NOISE_PUBLIC_KEY_LEN) { u8 *private_key = nla_data(info->attrs[WGDEVICE_A_PRIVATE_KEY]); u8 public_key[NOISE_PUBLIC_KEY_LEN]; struct wg_peer *peer, *temp; bool send_staged_packets; if (!crypto_memneq(wg->static_identity.static_private, private_key, NOISE_PUBLIC_KEY_LEN)) goto skip_set_private_key; /* We remove before setting, to prevent race, which means doing * two 25519-genpub ops. 
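 * (the public key is derived from the private key once here and a second time when wg_noise_set_static_identity_private_key() installs the new identity).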
*/ if (curve25519_generate_public(public_key, private_key)) { peer = wg_pubkey_hashtable_lookup(wg->peer_hashtable, public_key); if (peer) { wg_peer_put(peer); wg_peer_remove(peer); } } down_write(&wg->static_identity.lock); send_staged_packets = !wg->static_identity.has_identity && netif_running(wg->dev); wg_noise_set_static_identity_private_key(&wg->static_identity, private_key); send_staged_packets = send_staged_packets && wg->static_identity.has_identity; wg_cookie_checker_precompute_device_keys(&wg->cookie_checker); list_for_each_entry_safe(peer, temp, &wg->peer_list, peer_list) { wg_noise_precompute_static_static(peer); wg_noise_expire_current_peer_keypairs(peer); if (send_staged_packets) wg_packet_send_staged_packets(peer); } up_write(&wg->static_identity.lock); } skip_set_private_key: if (info->attrs[WGDEVICE_A_PEERS]) { struct nlattr *attr, *peer[WGPEER_A_MAX + 1]; int rem; nla_for_each_nested(attr, info->attrs[WGDEVICE_A_PEERS], rem) { ret = nla_parse_nested(peer, WGPEER_A_MAX, attr, peer_policy, NULL); if (ret < 0) goto out; ret = set_peer(wg, peer); if (ret < 0) goto out; } } ret = 0; out: mutex_unlock(&wg->device_update_lock); rtnl_unlock(); dev_put(wg->dev); out_nodev: if (info->attrs[WGDEVICE_A_PRIVATE_KEY]) memzero_explicit(nla_data(info->attrs[WGDEVICE_A_PRIVATE_KEY]), nla_len(info->attrs[WGDEVICE_A_PRIVATE_KEY])); return ret; } static const struct genl_ops genl_ops[] = { { .cmd = WG_CMD_GET_DEVICE, .start = wg_get_device_start, .dumpit = wg_get_device_dump, .done = wg_get_device_done, .flags = GENL_UNS_ADMIN_PERM }, { .cmd = WG_CMD_SET_DEVICE, .doit = wg_set_device, .flags = GENL_UNS_ADMIN_PERM } }; static struct genl_family genl_family __ro_after_init = { .ops = genl_ops, .n_ops = ARRAY_SIZE(genl_ops), .resv_start_op = WG_CMD_SET_DEVICE + 1, .name = WG_GENL_NAME, .version = WG_GENL_VERSION, .maxattr = WGDEVICE_A_MAX, .module = THIS_MODULE, .policy = device_policy, .netnsok = true }; int __init wg_genetlink_init(void) { return genl_register_family(&genl_family); } void __exit wg_genetlink_uninit(void) { genl_unregister_family(&genl_family); } |
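/* Editor's illustration (not part of the kernel sources above): before any
 * WG_CMD_* request can be issued, userspace must resolve the dynamic family
 * id that genl_register_family() obtained for WG_GENL_NAME. A raw
 * NETLINK_GENERIC sketch without libnl follows; error handling is minimal
 * and genl_resolve_family() is an invented name.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/genetlink.h>
#include <linux/wireguard.h>	/* WG_GENL_NAME */

static int genl_resolve_family(const char *name)
{
	struct {
		struct nlmsghdr nlh;
		struct genlmsghdr gnl;
		char payload[64];
	} req;
	struct sockaddr_nl dst = { .nl_family = AF_NETLINK };
	struct nlattr *na = (struct nlattr *)req.payload;
	struct nlmsghdr *rsp;
	char buf[4096];
	ssize_t len;
	int id = -1, fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);

	if (fd < 0)
		return -1;

	/* CTRL_CMD_GETFAMILY with CTRL_ATTR_FAMILY_NAME = "wireguard" */
	memset(&req, 0, sizeof(req));
	req.gnl.cmd = CTRL_CMD_GETFAMILY;
	req.gnl.version = 1;
	na->nla_type = CTRL_ATTR_FAMILY_NAME;
	na->nla_len = NLA_HDRLEN + strlen(name) + 1;
	memcpy((char *)na + NLA_HDRLEN, name, strlen(name) + 1);
	req.nlh.nlmsg_type = GENL_ID_CTRL;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.nlh.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN) + NLA_ALIGN(na->nla_len);

	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0 ||
	    (len = recv(fd, buf, sizeof(buf), 0)) <= 0)
		goto out;

	/* Walk the reply attributes looking for CTRL_ATTR_FAMILY_ID. */
	rsp = (struct nlmsghdr *)buf;
	if (rsp->nlmsg_type == NLMSG_ERROR)
		goto out;
	na = (struct nlattr *)((char *)NLMSG_DATA(rsp) + GENL_HDRLEN);
	len = rsp->nlmsg_len - NLMSG_LENGTH(GENL_HDRLEN);
	while (len >= (ssize_t)sizeof(*na)) {
		if (na->nla_type == CTRL_ATTR_FAMILY_ID) {
			id = *(unsigned short *)((char *)na + NLA_HDRLEN);
			break;
		}
		len -= NLA_ALIGN(na->nla_len);
		na = (struct nlattr *)((char *)na + NLA_ALIGN(na->nla_len));
	}
out:
	close(fd);
	return id;
}

int main(void)
{
	int id = genl_resolve_family(WG_GENL_NAME);

	/* A WG_CMD_GET_DEVICE dump would reuse the same framing: nlmsg_type =
	 * id, flags NLM_F_REQUEST | NLM_F_DUMP, genlmsghdr.cmd =
	 * WG_CMD_GET_DEVICE, plus a WGDEVICE_A_IFNAME attribute.
	 */
	printf("wireguard genl family id: %d\n", id);
	return id < 0;
}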
853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 | // SPDX-License-Identifier: GPL-2.0-only /* L2TP netlink layer, for management * * Copyright (c) 2008,2009,2010 Katalix Systems Ltd * * Partly based on the IrDA nelink implementation * (see net/irda/irnetlink.c) which is: * Copyright (c) 2007 Samuel Ortiz <samuel@sortiz.org> * which is in turn partly based on the wireless netlink code: * Copyright 2006 Johannes Berg <johannes@sipsolutions.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <net/sock.h> #include <net/genetlink.h> #include <net/udp.h> #include <linux/in.h> #include <linux/udp.h> #include <linux/socket.h> #include <linux/module.h> #include <linux/list.h> #include <net/net_namespace.h> #include <linux/l2tp.h> #include "l2tp_core.h" static struct genl_family l2tp_nl_family; static const struct genl_multicast_group l2tp_multicast_group[] = { { .name = L2TP_GENL_MCGROUP, }, }; static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int flags, struct l2tp_tunnel *tunnel, u8 cmd); static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int flags, struct l2tp_session *session, u8 cmd); /* Accessed under genl lock */ static const struct l2tp_nl_cmd_ops *l2tp_nl_cmd_ops[__L2TP_PWTYPE_MAX]; static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info) { u32 tunnel_id; u32 session_id; char *ifname; struct l2tp_tunnel *tunnel; struct l2tp_session *session = NULL; struct net *net = genl_info_net(info); if (info->attrs[L2TP_ATTR_IFNAME]) { ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]); session = l2tp_session_get_by_ifname(net, ifname); } else if ((info->attrs[L2TP_ATTR_SESSION_ID]) && (info->attrs[L2TP_ATTR_CONN_ID])) { tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); tunnel = l2tp_tunnel_get(net, tunnel_id); if (tunnel) { session = l2tp_session_get(net, tunnel->sock, tunnel->version, tunnel_id, session_id); l2tp_tunnel_put(tunnel); } } return session; } static int l2tp_nl_cmd_noop(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; void *hdr; int ret = -ENOBUFS; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto out; } hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &l2tp_nl_family, 0, L2TP_CMD_NOOP); if (!hdr) { ret = -EMSGSIZE; goto err_out; } genlmsg_end(msg, hdr); return genlmsg_unicast(genl_info_net(info), msg, info->snd_portid); err_out: nlmsg_free(msg); out: return ret; } static int l2tp_tunnel_notify(struct genl_family *family, struct genl_info *info, struct l2tp_tunnel *tunnel, u8 cmd) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) 
return -ENOMEM; ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, tunnel, cmd); if (ret >= 0) { ret = genlmsg_multicast_allns(family, msg, 0, 0); /* We don't care if no one is listening */ if (ret == -ESRCH) ret = 0; return ret; } nlmsg_free(msg); return ret; } static int l2tp_session_notify(struct genl_family *family, struct genl_info *info, struct l2tp_session *session, u8 cmd) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, session, cmd); if (ret >= 0) { ret = genlmsg_multicast_allns(family, msg, 0, 0); /* We don't care if no one is listening */ if (ret == -ESRCH) ret = 0; return ret; } nlmsg_free(msg); return ret; } static int l2tp_nl_cmd_tunnel_create_get_addr(struct nlattr **attrs, struct l2tp_tunnel_cfg *cfg) { if (attrs[L2TP_ATTR_UDP_SPORT]) cfg->local_udp_port = nla_get_u16(attrs[L2TP_ATTR_UDP_SPORT]); if (attrs[L2TP_ATTR_UDP_DPORT]) cfg->peer_udp_port = nla_get_u16(attrs[L2TP_ATTR_UDP_DPORT]); cfg->use_udp_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_CSUM]); /* Must have either AF_INET or AF_INET6 address for source and destination */ #if IS_ENABLED(CONFIG_IPV6) if (attrs[L2TP_ATTR_IP6_SADDR] && attrs[L2TP_ATTR_IP6_DADDR]) { cfg->local_ip6 = nla_data(attrs[L2TP_ATTR_IP6_SADDR]); cfg->peer_ip6 = nla_data(attrs[L2TP_ATTR_IP6_DADDR]); cfg->udp6_zero_tx_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]); cfg->udp6_zero_rx_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]); return 0; } #endif if (attrs[L2TP_ATTR_IP_SADDR] && attrs[L2TP_ATTR_IP_DADDR]) { cfg->local_ip.s_addr = nla_get_in_addr(attrs[L2TP_ATTR_IP_SADDR]); cfg->peer_ip.s_addr = nla_get_in_addr(attrs[L2TP_ATTR_IP_DADDR]); return 0; } return -EINVAL; } static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info) { u32 tunnel_id; u32 peer_tunnel_id; int proto_version; int fd = -1; int ret = 0; struct l2tp_tunnel_cfg cfg = { 0, }; struct l2tp_tunnel *tunnel; struct net *net = genl_info_net(info); struct nlattr **attrs = info->attrs; if (!attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; goto out; } tunnel_id = nla_get_u32(attrs[L2TP_ATTR_CONN_ID]); if (!attrs[L2TP_ATTR_PEER_CONN_ID]) { ret = -EINVAL; goto out; } peer_tunnel_id = nla_get_u32(attrs[L2TP_ATTR_PEER_CONN_ID]); if (!attrs[L2TP_ATTR_PROTO_VERSION]) { ret = -EINVAL; goto out; } proto_version = nla_get_u8(attrs[L2TP_ATTR_PROTO_VERSION]); if (!attrs[L2TP_ATTR_ENCAP_TYPE]) { ret = -EINVAL; goto out; } cfg.encap = nla_get_u16(attrs[L2TP_ATTR_ENCAP_TYPE]); /* Managed tunnels take the tunnel socket from userspace. * Unmanaged tunnels must call out the source and destination addresses * for the kernel to create the tunnel socket itself. 
*/ if (attrs[L2TP_ATTR_FD]) { fd = nla_get_u32(attrs[L2TP_ATTR_FD]); } else { ret = l2tp_nl_cmd_tunnel_create_get_addr(attrs, &cfg); if (ret < 0) goto out; } ret = -EINVAL; switch (cfg.encap) { case L2TP_ENCAPTYPE_UDP: case L2TP_ENCAPTYPE_IP: ret = l2tp_tunnel_create(fd, proto_version, tunnel_id, peer_tunnel_id, &cfg, &tunnel); break; } if (ret < 0) goto out; refcount_inc(&tunnel->ref_count); ret = l2tp_tunnel_register(tunnel, net, &cfg); if (ret < 0) { kfree(tunnel); goto out; } ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_CREATE); l2tp_tunnel_put(tunnel); out: return ret; } static int l2tp_nl_cmd_tunnel_delete(struct sk_buff *skb, struct genl_info *info) { struct l2tp_tunnel *tunnel; u32 tunnel_id; int ret = 0; struct net *net = genl_info_net(info); if (!info->attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; goto out; } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); tunnel = l2tp_tunnel_get(net, tunnel_id); if (!tunnel) { ret = -ENODEV; goto out; } l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_DELETE); l2tp_tunnel_delete(tunnel); l2tp_tunnel_put(tunnel); out: return ret; } static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info) { struct l2tp_tunnel *tunnel; u32 tunnel_id; int ret = 0; struct net *net = genl_info_net(info); if (!info->attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; goto out; } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); tunnel = l2tp_tunnel_get(net, tunnel_id); if (!tunnel) { ret = -ENODEV; goto out; } ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_MODIFY); l2tp_tunnel_put(tunnel); out: return ret; } #if IS_ENABLED(CONFIG_IPV6) static int l2tp_nl_tunnel_send_addr6(struct sk_buff *skb, struct sock *sk, enum l2tp_encap_type encap) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); switch (encap) { case L2TP_ENCAPTYPE_UDP: if (udp_get_no_check6_tx(sk) && nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_TX)) return -1; if (udp_get_no_check6_rx(sk) && nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_RX)) return -1; if (nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) || nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport))) return -1; fallthrough; case L2TP_ENCAPTYPE_IP: if (nla_put_in6_addr(skb, L2TP_ATTR_IP6_SADDR, &np->saddr) || nla_put_in6_addr(skb, L2TP_ATTR_IP6_DADDR, &sk->sk_v6_daddr)) return -1; break; } return 0; } #endif static int l2tp_nl_tunnel_send_addr4(struct sk_buff *skb, struct sock *sk, enum l2tp_encap_type encap) { struct inet_sock *inet = inet_sk(sk); switch (encap) { case L2TP_ENCAPTYPE_UDP: if (nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx) || nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) || nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport))) return -1; fallthrough; case L2TP_ENCAPTYPE_IP: if (nla_put_in_addr(skb, L2TP_ATTR_IP_SADDR, inet->inet_saddr) || nla_put_in_addr(skb, L2TP_ATTR_IP_DADDR, inet->inet_daddr)) return -1; break; } return 0; } /* Append attributes for the tunnel address, handling the different attribute types * used for different tunnel encapsulation and AF_INET v.s. AF_INET6. 
*/ static int l2tp_nl_tunnel_send_addr(struct sk_buff *skb, struct l2tp_tunnel *tunnel) { struct sock *sk = tunnel->sock; if (!sk) return 0; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return l2tp_nl_tunnel_send_addr6(skb, sk, tunnel->encap); #endif return l2tp_nl_tunnel_send_addr4(skb, sk, tunnel->encap); } static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int flags, struct l2tp_tunnel *tunnel, u8 cmd) { void *hdr; struct nlattr *nest; hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (nla_put_u8(skb, L2TP_ATTR_PROTO_VERSION, tunnel->version) || nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) || nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) || nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) || nla_put_u16(skb, L2TP_ATTR_ENCAP_TYPE, tunnel->encap)) goto nla_put_failure; nest = nla_nest_start_noflag(skb, L2TP_ATTR_STATS); if (!nest) goto nla_put_failure; if (nla_put_u64_64bit(skb, L2TP_ATTR_TX_PACKETS, atomic_long_read(&tunnel->stats.tx_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_TX_BYTES, atomic_long_read(&tunnel->stats.tx_bytes), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_TX_ERRORS, atomic_long_read(&tunnel->stats.tx_errors), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_PACKETS, atomic_long_read(&tunnel->stats.rx_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_BYTES, atomic_long_read(&tunnel->stats.rx_bytes), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS, atomic_long_read(&tunnel->stats.rx_seq_discards), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_COOKIE_DISCARDS, atomic_long_read(&tunnel->stats.rx_cookie_discards), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS, atomic_long_read(&tunnel->stats.rx_oos_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_ERRORS, atomic_long_read(&tunnel->stats.rx_errors), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_INVALID, atomic_long_read(&tunnel->stats.rx_invalid), L2TP_ATTR_STATS_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); if (l2tp_nl_tunnel_send_addr(skb, tunnel)) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -1; } static int l2tp_nl_cmd_tunnel_get(struct sk_buff *skb, struct genl_info *info) { struct l2tp_tunnel *tunnel; struct sk_buff *msg; u32 tunnel_id; int ret = -ENOBUFS; struct net *net = genl_info_net(info); if (!info->attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; goto err; } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err; } tunnel = l2tp_tunnel_get(net, tunnel_id); if (!tunnel) { ret = -ENODEV; goto err_nlmsg; } ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, tunnel, L2TP_CMD_TUNNEL_GET); if (ret < 0) goto err_nlmsg_tunnel; l2tp_tunnel_put(tunnel); return genlmsg_unicast(net, msg, info->snd_portid); err_nlmsg_tunnel: l2tp_tunnel_put(tunnel); err_nlmsg: nlmsg_free(msg); err: return ret; } struct l2tp_nl_cb_data { unsigned long tkey; unsigned long skey; }; static int l2tp_nl_cmd_tunnel_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct l2tp_nl_cb_data *cbd = (void *)&cb->ctx[0]; unsigned long key = cbd->tkey; struct l2tp_tunnel *tunnel; struct net *net = sock_net(skb->sk); for (;;) { tunnel = l2tp_tunnel_get_next(net, &key); if (!tunnel) goto out; if (l2tp_nl_tunnel_send(skb, 
NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, tunnel, L2TP_CMD_TUNNEL_GET) < 0) { l2tp_tunnel_put(tunnel); goto out; } l2tp_tunnel_put(tunnel); key++; } out: cbd->tkey = key; return skb->len; } static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *info) { u32 tunnel_id = 0; u32 session_id; u32 peer_session_id; int ret = 0; struct l2tp_tunnel *tunnel; struct l2tp_session *session; struct l2tp_session_cfg cfg = { 0, }; struct net *net = genl_info_net(info); if (!info->attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; goto out; } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); tunnel = l2tp_tunnel_get(net, tunnel_id); if (!tunnel) { ret = -ENODEV; goto out; } if (!info->attrs[L2TP_ATTR_SESSION_ID]) { ret = -EINVAL; goto out_tunnel; } session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); if (!info->attrs[L2TP_ATTR_PEER_SESSION_ID]) { ret = -EINVAL; goto out_tunnel; } peer_session_id = nla_get_u32(info->attrs[L2TP_ATTR_PEER_SESSION_ID]); if (!info->attrs[L2TP_ATTR_PW_TYPE]) { ret = -EINVAL; goto out_tunnel; } cfg.pw_type = nla_get_u16(info->attrs[L2TP_ATTR_PW_TYPE]); if (cfg.pw_type >= __L2TP_PWTYPE_MAX) { ret = -EINVAL; goto out_tunnel; } /* L2TPv2 only accepts PPP pseudo-wires */ if (tunnel->version == 2 && cfg.pw_type != L2TP_PWTYPE_PPP) { ret = -EPROTONOSUPPORT; goto out_tunnel; } if (tunnel->version > 2) { if (info->attrs[L2TP_ATTR_L2SPEC_TYPE]) { cfg.l2specific_type = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_TYPE]); if (cfg.l2specific_type != L2TP_L2SPECTYPE_DEFAULT && cfg.l2specific_type != L2TP_L2SPECTYPE_NONE) { ret = -EINVAL; goto out_tunnel; } } else { cfg.l2specific_type = L2TP_L2SPECTYPE_DEFAULT; } if (info->attrs[L2TP_ATTR_COOKIE]) { u16 len = nla_len(info->attrs[L2TP_ATTR_COOKIE]); if (len > 8) { ret = -EINVAL; goto out_tunnel; } cfg.cookie_len = len; memcpy(&cfg.cookie[0], nla_data(info->attrs[L2TP_ATTR_COOKIE]), len); } if (info->attrs[L2TP_ATTR_PEER_COOKIE]) { u16 len = nla_len(info->attrs[L2TP_ATTR_PEER_COOKIE]); if (len > 8) { ret = -EINVAL; goto out_tunnel; } cfg.peer_cookie_len = len; memcpy(&cfg.peer_cookie[0], nla_data(info->attrs[L2TP_ATTR_PEER_COOKIE]), len); } if (info->attrs[L2TP_ATTR_IFNAME]) cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]); } if (info->attrs[L2TP_ATTR_RECV_SEQ]) cfg.recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); if (info->attrs[L2TP_ATTR_SEND_SEQ]) cfg.send_seq = nla_get_u8(info->attrs[L2TP_ATTR_SEND_SEQ]); if (info->attrs[L2TP_ATTR_LNS_MODE]) cfg.lns_mode = nla_get_u8(info->attrs[L2TP_ATTR_LNS_MODE]); if (info->attrs[L2TP_ATTR_RECV_TIMEOUT]) cfg.reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]); #ifdef CONFIG_MODULES if (!l2tp_nl_cmd_ops[cfg.pw_type]) { genl_unlock(); request_module("net-l2tp-type-%u", cfg.pw_type); genl_lock(); } #endif if (!l2tp_nl_cmd_ops[cfg.pw_type] || !l2tp_nl_cmd_ops[cfg.pw_type]->session_create) { ret = -EPROTONOSUPPORT; goto out_tunnel; } ret = l2tp_nl_cmd_ops[cfg.pw_type]->session_create(net, tunnel, session_id, peer_session_id, &cfg); if (ret >= 0) { session = l2tp_session_get(net, tunnel->sock, tunnel->version, tunnel_id, session_id); if (session) { ret = l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_CREATE); l2tp_session_put(session); } } out_tunnel: l2tp_tunnel_put(tunnel); out: return ret; } static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *info) { int ret = 0; struct l2tp_session *session; u16 pw_type; session = l2tp_nl_session_get(info); if (!session) { ret = -ENODEV; goto out; } 
l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_DELETE); pw_type = session->pwtype; if (pw_type < __L2TP_PWTYPE_MAX) if (l2tp_nl_cmd_ops[pw_type] && l2tp_nl_cmd_ops[pw_type]->session_delete) l2tp_nl_cmd_ops[pw_type]->session_delete(session); l2tp_session_put(session); out: return ret; } static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *info) { int ret = 0; struct l2tp_session *session; session = l2tp_nl_session_get(info); if (!session) { ret = -ENODEV; goto out; } if (info->attrs[L2TP_ATTR_RECV_SEQ]) session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); if (info->attrs[L2TP_ATTR_SEND_SEQ]) { struct l2tp_tunnel *tunnel = session->tunnel; session->send_seq = nla_get_u8(info->attrs[L2TP_ATTR_SEND_SEQ]); l2tp_session_set_header_len(session, tunnel->version, tunnel->encap); } if (info->attrs[L2TP_ATTR_LNS_MODE]) session->lns_mode = nla_get_u8(info->attrs[L2TP_ATTR_LNS_MODE]); if (info->attrs[L2TP_ATTR_RECV_TIMEOUT]) session->reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]); ret = l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_MODIFY); l2tp_session_put(session); out: return ret; } static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int flags, struct l2tp_session *session, u8 cmd) { void *hdr; struct nlattr *nest; struct l2tp_tunnel *tunnel = session->tunnel; hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) || nla_put_u32(skb, L2TP_ATTR_SESSION_ID, session->session_id) || nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) || nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID, session->peer_session_id) || nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) || nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype)) goto nla_put_failure; if ((session->ifname[0] && nla_put_string(skb, L2TP_ATTR_IFNAME, session->ifname)) || (session->cookie_len && nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len, session->cookie)) || (session->peer_cookie_len && nla_put(skb, L2TP_ATTR_PEER_COOKIE, session->peer_cookie_len, session->peer_cookie)) || nla_put_u8(skb, L2TP_ATTR_RECV_SEQ, session->recv_seq) || nla_put_u8(skb, L2TP_ATTR_SEND_SEQ, session->send_seq) || nla_put_u8(skb, L2TP_ATTR_LNS_MODE, session->lns_mode) || (l2tp_tunnel_uses_xfrm(tunnel) && nla_put_u8(skb, L2TP_ATTR_USING_IPSEC, 1)) || (session->reorder_timeout && nla_put_msecs(skb, L2TP_ATTR_RECV_TIMEOUT, session->reorder_timeout, L2TP_ATTR_PAD))) goto nla_put_failure; nest = nla_nest_start_noflag(skb, L2TP_ATTR_STATS); if (!nest) goto nla_put_failure; if (nla_put_u64_64bit(skb, L2TP_ATTR_TX_PACKETS, atomic_long_read(&session->stats.tx_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_TX_BYTES, atomic_long_read(&session->stats.tx_bytes), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_TX_ERRORS, atomic_long_read(&session->stats.tx_errors), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_PACKETS, atomic_long_read(&session->stats.rx_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_BYTES, atomic_long_read(&session->stats.rx_bytes), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS, atomic_long_read(&session->stats.rx_seq_discards), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_COOKIE_DISCARDS, atomic_long_read(&session->stats.rx_cookie_discards), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS, 
atomic_long_read(&session->stats.rx_oos_packets), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_ERRORS, atomic_long_read(&session->stats.rx_errors), L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_INVALID, atomic_long_read(&session->stats.rx_invalid), L2TP_ATTR_STATS_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -1; } static int l2tp_nl_cmd_session_get(struct sk_buff *skb, struct genl_info *info) { struct l2tp_session *session; struct sk_buff *msg; int ret; session = l2tp_nl_session_get(info); if (!session) { ret = -ENODEV; goto err; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto err_ref; } ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq, 0, session, L2TP_CMD_SESSION_GET); if (ret < 0) goto err_ref_msg; ret = genlmsg_unicast(genl_info_net(info), msg, info->snd_portid); l2tp_session_put(session); return ret; err_ref_msg: nlmsg_free(msg); err_ref: l2tp_session_put(session); err: return ret; } static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct l2tp_nl_cb_data *cbd = (void *)&cb->ctx[0]; struct net *net = sock_net(skb->sk); struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; unsigned long tkey = cbd->tkey; unsigned long skey = cbd->skey; for (;;) { if (!tunnel) { tunnel = l2tp_tunnel_get_next(net, &tkey); if (!tunnel) goto out; } session = l2tp_session_get_next(net, tunnel->sock, tunnel->version, tunnel->tunnel_id, &skey); if (!session) { tkey++; l2tp_tunnel_put(tunnel); tunnel = NULL; skey = 0; continue; } if (l2tp_nl_session_send(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, session, L2TP_CMD_SESSION_GET) < 0) { l2tp_session_put(session); l2tp_tunnel_put(tunnel); break; } l2tp_session_put(session); skey++; } out: cbd->tkey = tkey; cbd->skey = skey; return skb->len; } static const struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = { [L2TP_ATTR_NONE] = { .type = NLA_UNSPEC, }, [L2TP_ATTR_PW_TYPE] = { .type = NLA_U16, }, [L2TP_ATTR_ENCAP_TYPE] = { .type = NLA_U16, }, [L2TP_ATTR_OFFSET] = { .type = NLA_U16, }, [L2TP_ATTR_DATA_SEQ] = { .type = NLA_U8, }, [L2TP_ATTR_L2SPEC_TYPE] = { .type = NLA_U8, }, [L2TP_ATTR_L2SPEC_LEN] = { .type = NLA_U8, }, [L2TP_ATTR_PROTO_VERSION] = { .type = NLA_U8, }, [L2TP_ATTR_CONN_ID] = { .type = NLA_U32, }, [L2TP_ATTR_PEER_CONN_ID] = { .type = NLA_U32, }, [L2TP_ATTR_SESSION_ID] = { .type = NLA_U32, }, [L2TP_ATTR_PEER_SESSION_ID] = { .type = NLA_U32, }, [L2TP_ATTR_UDP_CSUM] = { .type = NLA_U8, }, [L2TP_ATTR_VLAN_ID] = { .type = NLA_U16, }, [L2TP_ATTR_DEBUG] = { .type = NLA_U32, }, [L2TP_ATTR_RECV_SEQ] = { .type = NLA_U8, }, [L2TP_ATTR_SEND_SEQ] = { .type = NLA_U8, }, [L2TP_ATTR_LNS_MODE] = { .type = NLA_U8, }, [L2TP_ATTR_USING_IPSEC] = { .type = NLA_U8, }, [L2TP_ATTR_RECV_TIMEOUT] = { .type = NLA_MSECS, }, [L2TP_ATTR_FD] = { .type = NLA_U32, }, [L2TP_ATTR_IP_SADDR] = { .type = NLA_U32, }, [L2TP_ATTR_IP_DADDR] = { .type = NLA_U32, }, [L2TP_ATTR_UDP_SPORT] = { .type = NLA_U16, }, [L2TP_ATTR_UDP_DPORT] = { .type = NLA_U16, }, [L2TP_ATTR_MTU] = { .type = NLA_U16, }, [L2TP_ATTR_MRU] = { .type = NLA_U16, }, [L2TP_ATTR_STATS] = { .type = NLA_NESTED, }, [L2TP_ATTR_IP6_SADDR] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr), }, [L2TP_ATTR_IP6_DADDR] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr), }, [L2TP_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1, }, [L2TP_ATTR_COOKIE] = { .type = NLA_BINARY, .len = 
8, }, [L2TP_ATTR_PEER_COOKIE] = { .type = NLA_BINARY, .len = 8, }, }; static const struct genl_small_ops l2tp_nl_ops[] = { { .cmd = L2TP_CMD_NOOP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_noop, /* can be retrieved by unprivileged users */ }, { .cmd = L2TP_CMD_TUNNEL_CREATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_create, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_TUNNEL_DELETE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_delete, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_TUNNEL_MODIFY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_modify, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_TUNNEL_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_get, .dumpit = l2tp_nl_cmd_tunnel_dump, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_CREATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_create, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_DELETE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_delete, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_MODIFY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_modify, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_get, .dumpit = l2tp_nl_cmd_session_dump, .flags = GENL_UNS_ADMIN_PERM, }, }; static struct genl_family l2tp_nl_family __ro_after_init = { .name = L2TP_GENL_NAME, .version = L2TP_GENL_VERSION, .hdrsize = 0, .maxattr = L2TP_ATTR_MAX, .policy = l2tp_nl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = l2tp_nl_ops, .n_small_ops = ARRAY_SIZE(l2tp_nl_ops), .resv_start_op = L2TP_CMD_SESSION_GET + 1, .mcgrps = l2tp_multicast_group, .n_mcgrps = ARRAY_SIZE(l2tp_multicast_group), }; int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops) { int ret; ret = -EINVAL; if (pw_type >= __L2TP_PWTYPE_MAX) goto err; genl_lock(); ret = -EBUSY; if (l2tp_nl_cmd_ops[pw_type]) goto out; l2tp_nl_cmd_ops[pw_type] = ops; ret = 0; out: genl_unlock(); err: return ret; } EXPORT_SYMBOL_GPL(l2tp_nl_register_ops); void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type) { if (pw_type < __L2TP_PWTYPE_MAX) { genl_lock(); l2tp_nl_cmd_ops[pw_type] = NULL; genl_unlock(); } } EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops); static int __init l2tp_nl_init(void) { pr_info("L2TP netlink interface\n"); return genl_register_family(&l2tp_nl_family); } static void l2tp_nl_cleanup(void) { genl_unregister_family(&l2tp_nl_family); } module_init(l2tp_nl_init); module_exit(l2tp_nl_cleanup); MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("L2TP netlink"); MODULE_LICENSE("GPL"); MODULE_VERSION("1.0"); MODULE_ALIAS_GENL_FAMILY("l2tp"); |
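/*
 * Usage illustration (not part of the module above): a minimal userspace
 * sketch, assuming libnl-3 and its genl helpers, showing how the "l2tp"
 * generic netlink family registered above can be driven.  It resolves the
 * family id and sends L2TP_CMD_NOOP, which l2tp_nl_cmd_noop() answers with
 * an empty unicast reply readable from the same socket; iproute2's
 * "ip l2tp" commands use the same command set.  Error handling is
 * deliberately abbreviated.
 */
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/l2tp.h>

int main(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg;
	int family, ret;

	if (!sk || genl_connect(sk))
		return 1;

	/* Map L2TP_GENL_NAME ("l2tp") to its dynamically assigned id. */
	family = genl_ctrl_resolve(sk, L2TP_GENL_NAME);
	if (family < 0)
		return 1;

	msg = nlmsg_alloc();
	if (!msg)
		return 1;

	/* Build the genl header: no family-specific header, version 1. */
	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    L2TP_CMD_NOOP, L2TP_GENL_VERSION);

	ret = nl_send_auto(sk, msg);

	nlmsg_free(msg);
	nl_socket_free(sk);
	return ret < 0;
}
/* Build sketch (assumption): cc noop.c $(pkg-config --cflags --libs libnl-genl-3.0) */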
| 39 39 39 21 15 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 | // SPDX-License-Identifier: GPL-2.0-or-later /* Linux driver for Philips webcam Various miscellaneous functions and tables. (C) 1999-2003 Nemosoft Unv. (C) 2004-2006 Luc Saillard (luc@saillard.org) NOTE: this version of pwc is an unofficial (modified) release of pwc & pcwx driver and thus may have bugs that are not present in the original version. Please send bug reports and support requests to <luc@saillard.org>. The decompression routines have been implemented by reverse-engineering the Nemosoft binary pwcx module. Caveat emptor. */ #include "pwc.h" const int pwc_image_sizes[PSZ_MAX][2] = { { 128, 96 }, /* sqcif */ { 160, 120 }, /* qsif */ { 176, 144 }, /* qcif */ { 320, 240 }, /* sif */ { 352, 288 }, /* cif */ { 640, 480 }, /* vga */ }; /* x,y -> PSZ_ */ int pwc_get_size(struct pwc_device *pdev, int width, int height) { int i; /* Find the largest size supported by the camera that fits into the requested size. */ for (i = PSZ_MAX - 1; i >= 0; i--) { if (!(pdev->image_mask & (1 << i))) continue; if (pwc_image_sizes[i][0] <= width && pwc_image_sizes[i][1] <= height) return i; } /* No mode found, return the smallest mode we have */ for (i = 0; i < PSZ_MAX; i++) { if (pdev->image_mask & (1 << i)) return i; } /* Never reached there always is at least one supported mode */ return 0; } /* initialize variables depending on type and decompressor */ void pwc_construct(struct pwc_device *pdev) { if (DEVICE_USE_CODEC1(pdev->type)) { pdev->image_mask = 1 << PSZ_SQCIF | 1 << PSZ_QCIF | 1 << PSZ_CIF; pdev->vcinterface = 2; pdev->vendpoint = 4; pdev->frame_header_size = 0; pdev->frame_trailer_size = 0; } else if (DEVICE_USE_CODEC3(pdev->type)) { pdev->image_mask = 1 << PSZ_QSIF | 1 << PSZ_SIF | 1 << PSZ_VGA; pdev->vcinterface = 3; pdev->vendpoint = 5; pdev->frame_header_size = TOUCAM_HEADER_SIZE; pdev->frame_trailer_size = TOUCAM_TRAILER_SIZE; } else /* if (DEVICE_USE_CODEC2(pdev->type)) */ { pdev->image_mask = 1 << PSZ_SQCIF | 1 << PSZ_QSIF | 1 << PSZ_QCIF | 1 << PSZ_SIF | 1 << PSZ_CIF | 1 << PSZ_VGA; pdev->vcinterface = 3; pdev->vendpoint = 4; pdev->frame_header_size = 0; pdev->frame_trailer_size = 0; } } |
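/*
 * Illustration only: a self-contained userspace rework of the mode
 * selection implemented by pwc_get_size() above.  Given a bitmask of
 * supported sizes, pick the largest mode that still fits the requested
 * width/height, falling back to the smallest supported mode.  The table
 * and its ordering mirror pwc_image_sizes[]; all names here are local to
 * the example, not driver symbols.
 */
#include <stdio.h>

#define PSZ_COUNT 6

static const int image_sizes[PSZ_COUNT][2] = {
	{ 128,  96 },	/* sqcif */
	{ 160, 120 },	/* qsif */
	{ 176, 144 },	/* qcif */
	{ 320, 240 },	/* sif */
	{ 352, 288 },	/* cif */
	{ 640, 480 },	/* vga */
};

static int get_size(unsigned int image_mask, int width, int height)
{
	int i;

	/* Largest supported mode that fits into the request. */
	for (i = PSZ_COUNT - 1; i >= 0; i--) {
		if (!(image_mask & (1 << i)))
			continue;
		if (image_sizes[i][0] <= width && image_sizes[i][1] <= height)
			return i;
	}
	/* Nothing fits: fall back to the smallest supported mode. */
	for (i = 0; i < PSZ_COUNT; i++)
		if (image_mask & (1 << i))
			return i;
	return 0;
}

int main(void)
{
	/* CODEC3-style mask from pwc_construct(): qsif | sif | vga (bits 1, 3, 5). */
	unsigned int mask = (1 << 1) | (1 << 3) | (1 << 5);

	/* A 400x300 request selects sif (320x240), the largest fit. */
	int i = get_size(mask, 400, 300);

	printf("%dx%d\n", image_sizes[i][0], image_sizes[i][1]);
	return 0;
}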
| 2 1 2 1 2 2 2 2 1 2 1 2 1 2 2 1 2 2 2 1 19 19 18 1 18 17 1 17 17 16 1 16 15 15 15 15 15 15 15 15 15 15 15 15 15 10 10 2 4 2 2 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 | // SPDX-License-Identifier: GPL-2.0-only #include <net/netdev_queues.h> #include "netlink.h" #include "common.h" struct rings_req_info { struct ethnl_req_info base; }; struct rings_reply_data { struct ethnl_reply_data base; struct ethtool_ringparam ringparam; struct kernel_ethtool_ringparam kernel_ringparam; u32 supported_ring_params; }; #define RINGS_REPDATA(__reply_base) \ container_of(__reply_base, struct rings_reply_data, base) const struct nla_policy ethnl_rings_get_policy[] = { [ETHTOOL_A_RINGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int rings_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct rings_reply_data *data = RINGS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; data->supported_ring_params = dev->ethtool_ops->supported_ring_params; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->kernel_ringparam.tcp_data_split = dev->cfg->hds_config; data->kernel_ringparam.hds_thresh = dev->cfg->hds_thresh; dev->ethtool_ops->get_ringparam(dev, &data->ringparam, &data->kernel_ringparam, info->extack); ethnl_ops_complete(dev); return 0; } static int rings_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { return nla_total_size(sizeof(u32)) + /* _RINGS_RX_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO */ nla_total_size(sizeof(u32)) + /* _RINGS_TX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_BUF_LEN */ nla_total_size(sizeof(u8)) + /* _RINGS_TCP_DATA_SPLIT */ nla_total_size(sizeof(u32) + /* _RINGS_CQE_SIZE */ nla_total_size(sizeof(u8)) + /* _RINGS_TX_PUSH */ nla_total_size(sizeof(u8))) + /* _RINGS_RX_PUSH */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN_MAX */ nla_total_size(sizeof(u32)) + /* _RINGS_HDS_THRESH */ nla_total_size(sizeof(u32)); 
/* _RINGS_HDS_THRESH_MAX*/ } static int rings_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct rings_reply_data *data = RINGS_REPDATA(reply_base); const struct kernel_ethtool_ringparam *kr = &data->kernel_ringparam; const struct ethtool_ringparam *ringparam = &data->ringparam; u32 supported_ring_params = data->supported_ring_params; WARN_ON(kr->tcp_data_split > ETHTOOL_TCP_DATA_SPLIT_ENABLED); if ((ringparam->rx_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MAX, ringparam->rx_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX, ringparam->rx_pending))) || (ringparam->rx_mini_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI_MAX, ringparam->rx_mini_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI, ringparam->rx_mini_pending))) || (ringparam->rx_jumbo_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO_MAX, ringparam->rx_jumbo_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO, ringparam->rx_jumbo_pending))) || (ringparam->tx_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_MAX, ringparam->tx_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_TX, ringparam->tx_pending))) || (kr->rx_buf_len && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, kr->rx_buf_len))) || (kr->tcp_data_split && (nla_put_u8(skb, ETHTOOL_A_RINGS_TCP_DATA_SPLIT, kr->tcp_data_split))) || (kr->cqe_size && (nla_put_u32(skb, ETHTOOL_A_RINGS_CQE_SIZE, kr->cqe_size))) || nla_put_u8(skb, ETHTOOL_A_RINGS_TX_PUSH, !!kr->tx_push) || nla_put_u8(skb, ETHTOOL_A_RINGS_RX_PUSH, !!kr->rx_push) || ((supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN) && (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, kr->tx_push_buf_max_len) || nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, kr->tx_push_buf_len))) || ((supported_ring_params & ETHTOOL_RING_USE_HDS_THRS) && (nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH, kr->hds_thresh) || nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH_MAX, kr->hds_thresh_max)))) return -EMSGSIZE; return 0; } /* RINGS_SET */ const struct nla_policy ethnl_rings_set_policy[] = { [ETHTOOL_A_RINGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_RINGS_RX] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_MINI] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_TX] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_BUF_LEN] = NLA_POLICY_MIN(NLA_U32, 1), [ETHTOOL_A_RINGS_TCP_DATA_SPLIT] = NLA_POLICY_MAX(NLA_U8, ETHTOOL_TCP_DATA_SPLIT_ENABLED), [ETHTOOL_A_RINGS_CQE_SIZE] = NLA_POLICY_MIN(NLA_U32, 1), [ETHTOOL_A_RINGS_TX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_RINGS_RX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_HDS_THRESH] = { .type = NLA_U32 }, }; static int ethnl_set_rings_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; struct nlattr **tb = info->attrs; if (tb[ETHTOOL_A_RINGS_RX_BUF_LEN] && !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_BUF_LEN)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_RX_BUF_LEN], "setting rx buf len not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TCP_DATA_SPLIT)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], "setting TCP data split is not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_HDS_THRESH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_HDS_THRS)) { 
NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_HDS_THRESH], "setting hds-thresh is not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_CQE_SIZE] && !(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_CQE_SIZE], "setting cqe size not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TX_PUSH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH], "setting tx push not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_RX_PUSH] && !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_PUSH)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_RX_PUSH], "setting rx push not supported"); return -EOPNOTSUPP; } if (tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] && !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], "setting tx push buf len is not supported"); return -EOPNOTSUPP; } return ops->get_ringparam && ops->set_ringparam ? 1 : -EOPNOTSUPP; } static int ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) { struct kernel_ethtool_ringparam kernel_ringparam = {}; struct ethtool_ringparam ringparam = {}; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; const struct nlattr *err_attr; bool mod = false; int ret; dev->ethtool_ops->get_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); kernel_ringparam.tcp_data_split = dev->cfg->hds_config; ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod); ethnl_update_u32(&ringparam.rx_mini_pending, tb[ETHTOOL_A_RINGS_RX_MINI], &mod); ethnl_update_u32(&ringparam.rx_jumbo_pending, tb[ETHTOOL_A_RINGS_RX_JUMBO], &mod); ethnl_update_u32(&ringparam.tx_pending, tb[ETHTOOL_A_RINGS_TX], &mod); ethnl_update_u32(&kernel_ringparam.rx_buf_len, tb[ETHTOOL_A_RINGS_RX_BUF_LEN], &mod); ethnl_update_u8(&kernel_ringparam.tcp_data_split, tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], &mod); ethnl_update_u32(&kernel_ringparam.cqe_size, tb[ETHTOOL_A_RINGS_CQE_SIZE], &mod); ethnl_update_u8(&kernel_ringparam.tx_push, tb[ETHTOOL_A_RINGS_TX_PUSH], &mod); ethnl_update_u8(&kernel_ringparam.rx_push, tb[ETHTOOL_A_RINGS_RX_PUSH], &mod); ethnl_update_u32(&kernel_ringparam.tx_push_buf_len, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], &mod); ethnl_update_u32(&kernel_ringparam.hds_thresh, tb[ETHTOOL_A_RINGS_HDS_THRESH], &mod); if (!mod) return 0; if (kernel_ringparam.tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_ENABLED && dev_xdp_sb_prog_count(dev)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], "tcp-data-split can not be enabled with single buffer XDP"); return -EINVAL; } if (dev_get_min_mp_channel_count(dev)) { if (kernel_ringparam.tcp_data_split != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { NL_SET_ERR_MSG(info->extack, "can't disable tcp-data-split while device has memory provider enabled"); return -EINVAL; } else if (kernel_ringparam.hds_thresh) { NL_SET_ERR_MSG(info->extack, "can't set non-zero hds_thresh while device is memory provider enabled"); return -EINVAL; } } /* ensure new ring parameters are within limits */ if (ringparam.rx_pending > ringparam.rx_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX]; else if (ringparam.rx_mini_pending > ringparam.rx_mini_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX_MINI]; else if (ringparam.rx_jumbo_pending > ringparam.rx_jumbo_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX_JUMBO]; else if (ringparam.tx_pending > ringparam.tx_max_pending) err_attr = 
tb[ETHTOOL_A_RINGS_TX]; else if (kernel_ringparam.hds_thresh > kernel_ringparam.hds_thresh_max) err_attr = tb[ETHTOOL_A_RINGS_HDS_THRESH]; else err_attr = NULL; if (err_attr) { NL_SET_ERR_MSG_ATTR(info->extack, err_attr, "requested ring size exceeds maximum"); return -EINVAL; } if (kernel_ringparam.tx_push_buf_len > kernel_ringparam.tx_push_buf_max_len) { NL_SET_ERR_MSG_ATTR_FMT(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], "Requested TX push buffer exceeds the maximum of %u", kernel_ringparam.tx_push_buf_max_len); return -EINVAL; } dev->cfg_pending->hds_config = kernel_ringparam.tcp_data_split; dev->cfg_pending->hds_thresh = kernel_ringparam.hds_thresh; ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_rings_request_ops = { .request_cmd = ETHTOOL_MSG_RINGS_GET, .reply_cmd = ETHTOOL_MSG_RINGS_GET_REPLY, .hdr_attr = ETHTOOL_A_RINGS_HEADER, .req_info_size = sizeof(struct rings_req_info), .reply_data_size = sizeof(struct rings_reply_data), .prepare_data = rings_prepare_data, .reply_size = rings_reply_size, .fill_reply = rings_fill_reply, .set_validate = ethnl_set_rings_validate, .set = ethnl_set_rings, .set_ntf_cmd = ETHTOOL_MSG_RINGS_NTF, }; |
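/*
 * Illustration only: the ethnl_update_u32()/"mod" pattern that
 * ethnl_set_rings() above relies on, shown as a standalone userspace
 * sketch.  The real kernel helpers take a struct nlattr * and read the
 * value with nla_get_u32(); here an absent attribute is modelled as a
 * NULL pointer.  A setting is overwritten only when the attribute is
 * present and carries a different value, and the change flag records
 * whether the driver's set_ringparam call is needed at all.
 */
#include <stdbool.h>
#include <stdio.h>

static void update_u32(unsigned int *dst, const unsigned int *src, bool *mod)
{
	if (!src || *dst == *src)
		return;		/* attribute absent or value unchanged */
	*dst = *src;
	*mod = true;
}

int main(void)
{
	unsigned int rx_pending = 256, tx_pending = 256;
	unsigned int new_rx = 1024;	/* request carries only an RX value */
	bool mod = false;

	update_u32(&rx_pending, &new_rx, &mod);
	update_u32(&tx_pending, NULL, &mod);

	if (!mod)
		return 0;	/* nothing changed: skip the driver call */

	printf("rx=%u tx=%u\n", rx_pending, tx_pending);
	return 0;
}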
// SPDX-License-Identifier: GPL-2.0 /* * Some IBSS support code for cfg80211. 
* * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2020-2024 Intel Corporation */ #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <linux/export.h> #include <net/cfg80211.h> #include "wext-compat.h" #include "nl80211.h" #include "rdev-ops.h" void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, struct ieee80211_channel *channel) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_bss *bss; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return; if (!wdev->u.ibss.ssid_len) return; bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, NULL, 0, IEEE80211_BSS_TYPE_IBSS, IEEE80211_PRIVACY_ANY); if (WARN_ON(!bss)) return; if (wdev->u.ibss.current_bss) { cfg80211_unhold_bss(wdev->u.ibss.current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->u.ibss.current_bss->pub); } cfg80211_hold_bss(bss_from_pub(bss)); wdev->u.ibss.current_bss = bss_from_pub(bss); cfg80211_upload_connect_keys(wdev); nl80211_send_ibss_bssid(wiphy_to_rdev(wdev->wiphy), dev, bssid, GFP_KERNEL); #ifdef CONFIG_CFG80211_WEXT memset(&wrqu, 0, sizeof(wrqu)); memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN); wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); #endif } void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, struct ieee80211_channel *channel, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; trace_cfg80211_ibss_joined(dev, bssid, channel); if (WARN_ON(!channel)) return; ev = kzalloc(sizeof(*ev), gfp); if (!ev) return; ev->type = EVENT_IBSS_JOINED; memcpy(ev->ij.bssid, bssid, ETH_ALEN); ev->ij.channel = channel; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_ibss_joined); int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ibss_params *params, struct cfg80211_cached_keys *connkeys) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_held(&rdev->wiphy.mtx); if (wdev->links[0].cac_started) return -EBUSY; if (wdev->u.ibss.ssid_len) return -EALREADY; if (!params->basic_rates) { /* * If no rates were explicitly configured, * use the mandatory rate set for 11b or * 11a for maximum compatibility. 
*/ struct ieee80211_supported_band *sband; enum nl80211_band band; u32 flag; int j; band = params->chandef.chan->band; if (band == NL80211_BAND_5GHZ || band == NL80211_BAND_6GHZ) flag = IEEE80211_RATE_MANDATORY_A; else flag = IEEE80211_RATE_MANDATORY_B; sband = rdev->wiphy.bands[band]; for (j = 0; j < sband->n_bitrates; j++) { if (sband->bitrates[j].flags & flag) params->basic_rates |= BIT(j); } } if (WARN_ON(connkeys && connkeys->def < 0)) return -EINVAL; if (WARN_ON(wdev->connect_keys)) kfree_sensitive(wdev->connect_keys); wdev->connect_keys = connkeys; wdev->u.ibss.chandef = params->chandef; if (connkeys) { params->wep_keys = connkeys->params; params->wep_tx_key = connkeys->def; } #ifdef CONFIG_CFG80211_WEXT wdev->wext.ibss.chandef = params->chandef; #endif err = rdev_join_ibss(rdev, dev, params); if (err) { wdev->connect_keys = NULL; return err; } memcpy(wdev->u.ibss.ssid, params->ssid, params->ssid_len); wdev->u.ibss.ssid_len = params->ssid_len; return 0; } void cfg80211_clear_ibss(struct net_device *dev, bool nowext) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int i; lockdep_assert_wiphy(wdev->wiphy); kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; rdev_set_qos_map(rdev, dev, NULL); /* * Delete all the keys ... pairwise keys can't really * exist any more anyway, but default keys might. */ if (rdev->ops->del_key) for (i = 0; i < 6; i++) rdev_del_key(rdev, dev, -1, i, false, NULL); if (wdev->u.ibss.current_bss) { cfg80211_unhold_bss(wdev->u.ibss.current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->u.ibss.current_bss->pub); } wdev->u.ibss.current_bss = NULL; wdev->u.ibss.ssid_len = 0; memset(&wdev->u.ibss.chandef, 0, sizeof(wdev->u.ibss.chandef)); #ifdef CONFIG_CFG80211_WEXT if (!nowext) wdev->wext.ibss.ssid_len = 0; #endif cfg80211_sched_dfs_chan_update(rdev); } int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, bool nowext) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_wiphy(wdev->wiphy); if (!wdev->u.ibss.ssid_len) return -ENOLINK; err = rdev_leave_ibss(rdev, dev); if (err) return err; wdev->conn_owner_nlportid = 0; cfg80211_clear_ibss(dev, nowext); return 0; } #ifdef CONFIG_CFG80211_WEXT int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { struct cfg80211_cached_keys *ck = NULL; enum nl80211_band band; int i, err; lockdep_assert_wiphy(wdev->wiphy); if (!wdev->wext.ibss.beacon_interval) wdev->wext.ibss.beacon_interval = 100; /* try to find an IBSS channel if none requested ... 
*/ if (!wdev->wext.ibss.chandef.chan) { struct ieee80211_channel *new_chan = NULL; for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; sband = rdev->wiphy.bands[band]; if (!sband) continue; for (i = 0; i < sband->n_channels; i++) { chan = &sband->channels[i]; if (chan->flags & IEEE80211_CHAN_NO_IR) continue; if (chan->flags & IEEE80211_CHAN_DISABLED) continue; new_chan = chan; break; } if (new_chan) break; } if (!new_chan) return -EINVAL; cfg80211_chandef_create(&wdev->wext.ibss.chandef, new_chan, NL80211_CHAN_NO_HT); } /* don't join -- SSID is not there */ if (!wdev->wext.ibss.ssid_len) return 0; if (!netif_running(wdev->netdev)) return 0; if (wdev->wext.keys) wdev->wext.keys->def = wdev->wext.default_key; wdev->wext.ibss.privacy = wdev->wext.default_key != -1; if (wdev->wext.keys && wdev->wext.keys->def != -1) { ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; for (i = 0; i < 4; i++) ck->params[i].key = ck->data[i]; } err = __cfg80211_join_ibss(rdev, wdev->netdev, &wdev->wext.ibss, ck); if (err) kfree(ck); return err; } int cfg80211_ibss_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *wextfreq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_channel *chan = NULL; int err, freq; /* call only for ibss! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; if (!rdev->ops->join_ibss) return -EOPNOTSUPP; freq = cfg80211_wext_freq(wextfreq); if (freq < 0) return freq; if (freq) { chan = ieee80211_get_channel(wdev->wiphy, freq); if (!chan) return -EINVAL; if (chan->flags & IEEE80211_CHAN_NO_IR || chan->flags & IEEE80211_CHAN_DISABLED) return -EINVAL; } if (wdev->wext.ibss.chandef.chan == chan) return 0; err = 0; if (wdev->u.ibss.ssid_len) err = cfg80211_leave_ibss(rdev, dev, true); if (err) return err; if (chan) { cfg80211_chandef_create(&wdev->wext.ibss.chandef, chan, NL80211_CHAN_NO_HT); wdev->wext.ibss.channel_fixed = true; } else { /* cfg80211_ibss_wext_join will pick one if needed */ wdev->wext.ibss.channel_fixed = false; } return cfg80211_ibss_wext_join(rdev, wdev); } int cfg80211_ibss_wext_giwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *freq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_channel *chan = NULL; /* call only for ibss! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; if (wdev->u.ibss.current_bss) chan = wdev->u.ibss.current_bss->pub.channel; else if (wdev->wext.ibss.chandef.chan) chan = wdev->wext.ibss.chandef.chan; if (chan) { freq->m = chan->center_freq; freq->e = 6; return 0; } /* no channel if not joining */ return -EINVAL; } int cfg80211_ibss_wext_siwessid(struct net_device *dev, struct iw_request_info *info, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); size_t len = data->length; int err; /* call only for ibss! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; if (!rdev->ops->join_ibss) return -EOPNOTSUPP; err = 0; if (wdev->u.ibss.ssid_len) err = cfg80211_leave_ibss(rdev, dev, true); if (err) return err; /* iwconfig uses nul termination in SSID.. 
*/ if (len > 0 && ssid[len - 1] == '\0') len--; memcpy(wdev->u.ibss.ssid, ssid, len); wdev->wext.ibss.ssid = wdev->u.ibss.ssid; wdev->wext.ibss.ssid_len = len; return cfg80211_ibss_wext_join(rdev, wdev); } int cfg80211_ibss_wext_giwessid(struct net_device *dev, struct iw_request_info *info, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; /* call only for ibss! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; data->flags = 0; if (wdev->u.ibss.ssid_len) { data->flags = 1; data->length = wdev->u.ibss.ssid_len; memcpy(ssid, wdev->u.ibss.ssid, data->length); } else if (wdev->wext.ibss.ssid && wdev->wext.ibss.ssid_len) { data->flags = 1; data->length = wdev->wext.ibss.ssid_len; memcpy(ssid, wdev->wext.ibss.ssid, data->length); } return 0; } int cfg80211_ibss_wext_siwap(struct net_device *dev, struct iw_request_info *info, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *bssid = ap_addr->sa_data; int err; /* call only for ibss! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; if (!rdev->ops->join_ibss) return -EOPNOTSUPP; if (ap_addr->sa_family != ARPHRD_ETHER) return -EINVAL; /* automatic mode */ if (is_zero_ether_addr(bssid) || is_broadcast_ether_addr(bssid)) bssid = NULL; if (bssid && !is_valid_ether_addr(bssid)) return -EINVAL; /* both automatic */ if (!bssid && !wdev->wext.ibss.bssid) return 0; /* fixed already - and no change */ if (wdev->wext.ibss.bssid && bssid && ether_addr_equal(bssid, wdev->wext.ibss.bssid)) return 0; err = 0; if (wdev->u.ibss.ssid_len) err = cfg80211_leave_ibss(rdev, dev, true); if (err) return err; if (bssid) { memcpy(wdev->wext.bssid, bssid, ETH_ALEN); wdev->wext.ibss.bssid = wdev->wext.bssid; } else wdev->wext.ibss.bssid = NULL; return cfg80211_ibss_wext_join(rdev, wdev); } int cfg80211_ibss_wext_giwap(struct net_device *dev, struct iw_request_info *info, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; /* call only for ibss! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_ADHOC)) return -EINVAL; ap_addr->sa_family = ARPHRD_ETHER; if (wdev->u.ibss.current_bss) memcpy(ap_addr->sa_data, wdev->u.ibss.current_bss->pub.bssid, ETH_ALEN); else if (wdev->wext.ibss.bssid) memcpy(ap_addr->sa_data, wdev->wext.ibss.bssid, ETH_ALEN); else eth_zero_addr(ap_addr->sa_data); return 0; } #endif |
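/*
 * Illustration only: the default basic-rate selection performed by
 * __cfg80211_join_ibss() above, as a standalone sketch.  When userspace
 * requests no basic rates, every rate carrying the band's mandatory flag
 * (11a for 5/6 GHz, 11b otherwise) is OR'd into the bitmap.  The rate
 * table and flag values below are made-up example data, not driver or
 * mac80211 definitions.
 */
#include <stdio.h>

#define RATE_MANDATORY_A (1 << 0)
#define RATE_MANDATORY_B (1 << 1)

struct rate { int kbps; unsigned int flags; };

int main(void)
{
	static const struct rate rates[] = {
		{ 1000,  RATE_MANDATORY_B },
		{ 2000,  RATE_MANDATORY_B },
		{ 5500,  RATE_MANDATORY_B },
		{ 6000,  RATE_MANDATORY_A },
		{ 11000, RATE_MANDATORY_B },
		{ 54000, 0 },
	};
	unsigned int basic_rates = 0;
	unsigned int flag = RATE_MANDATORY_B;	/* 2.4 GHz band */
	unsigned int j;

	for (j = 0; j < sizeof(rates) / sizeof(rates[0]); j++)
		if (rates[j].flags & flag)
			basic_rates |= 1U << j;

	/* Bits 0, 1, 2 and 4 are mandatory 11b rates -> 0x17. */
	printf("basic_rates bitmap: 0x%x\n", basic_rates);
	return 0;
}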
2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 
3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/namei.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/namei.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Big-endian to little-endian byte-swapping/bitmaps by * David S.
Miller (davem@caip.rutgers.edu), 1995 * Directory entry file type support and forward compatibility hooks * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 * Hash Tree Directory indexing (c) * Daniel Phillips, 2001 * Hash Tree Directory indexing porting * Christopher Li, 2002 * Hash Tree Directory indexing cleanup * Theodore Ts'o, 2002 */ #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/time.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/bio.h> #include <linux/iversion.h> #include <linux/unicode.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include <trace/events/ext4.h> /* * define how far ahead to read directories while searching them. */ #define NAMEI_RA_CHUNKS 2 #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) static struct buffer_head *ext4_append(handle_t *handle, struct inode *inode, ext4_lblk_t *block) { struct ext4_map_blocks map; struct buffer_head *bh; int err; if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && ((inode->i_size >> 10) >= EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) return ERR_PTR(-ENOSPC); *block = inode->i_size >> inode->i_sb->s_blocksize_bits; map.m_lblk = *block; map.m_len = 1; /* * We're appending new directory block. Make sure the block is not * allocated yet, otherwise we will end up corrupting the * directory. */ err = ext4_map_blocks(NULL, inode, &map, 0); if (err < 0) return ERR_PTR(err); if (err) { EXT4_ERROR_INODE(inode, "Logical block already allocated"); return ERR_PTR(-EFSCORRUPTED); } bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); if (IS_ERR(bh)) return bh; inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_mark_inode_dirty(handle, inode); if (err) goto out; BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto out; return bh; out: brelse(bh); ext4_std_error(inode->i_sb, err); return ERR_PTR(err); } static int ext4_dx_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent); /* * Hints to ext4_read_dirblock regarding whether we expect a directory * block being read to be an index block, or a block containing * directory entries (and if the latter, whether it was found via a * logical block in an htree index block). This is used to control * what sort of sanity checkinig ext4_read_dirblock() will do on the * directory block read from the storage device. EITHER will means * the caller doesn't know what kind of directory block will be read, * so no specific verification will be done. 
*/ typedef enum { EITHER, INDEX, DIRENT, DIRENT_HTREE } dirblock_type_t; #define ext4_read_dirblock(inode, block, type) \ __ext4_read_dirblock((inode), (block), (type), __func__, __LINE__) static struct buffer_head *__ext4_read_dirblock(struct inode *inode, ext4_lblk_t block, dirblock_type_t type, const char *func, unsigned int line) { struct buffer_head *bh; struct ext4_dir_entry *dirent; int is_dx_block = 0; if (block >= inode->i_size >> inode->i_blkbits) { ext4_error_inode(inode, func, line, block, "Attempting to read directory block (%u) that is past i_size (%llu)", block, inode->i_size); return ERR_PTR(-EFSCORRUPTED); } if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO)) bh = ERR_PTR(-EIO); else bh = ext4_bread(NULL, inode, block, 0); if (IS_ERR(bh)) { __ext4_warning(inode->i_sb, func, line, "inode #%lu: lblock %lu: comm %s: " "error %ld reading directory block", inode->i_ino, (unsigned long)block, current->comm, PTR_ERR(bh)); return bh; } /* The first directory block must not be a hole. */ if (!bh && (type == INDEX || type == DIRENT_HTREE || block == 0)) { ext4_error_inode(inode, func, line, block, "Directory hole found for htree %s block %u", (type == INDEX) ? "index" : "leaf", block); return ERR_PTR(-EFSCORRUPTED); } if (!bh) return NULL; dirent = (struct ext4_dir_entry *) bh->b_data; /* Determine whether or not we have an index block */ if (is_dx(inode)) { if (block == 0) is_dx_block = 1; else if (ext4_rec_len_from_disk(dirent->rec_len, inode->i_sb->s_blocksize) == inode->i_sb->s_blocksize) is_dx_block = 1; } if (!is_dx_block && type == INDEX) { ext4_error_inode(inode, func, line, block, "directory leaf block found instead of index block"); brelse(bh); return ERR_PTR(-EFSCORRUPTED); } if (!ext4_has_metadata_csum(inode->i_sb) || buffer_verified(bh)) return bh; /* * An empty leaf block can get mistaken for a index block; for * this reason, we can only check the index checksum when the * caller is sure it should be an index block. */ if (is_dx_block && type == INDEX) { if (ext4_dx_csum_verify(inode, dirent) && !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { ext4_error_inode_err(inode, func, line, block, EFSBADCRC, "Directory index failed checksum"); brelse(bh); return ERR_PTR(-EFSBADCRC); } } if (!is_dx_block) { if (ext4_dirblock_csum_verify(inode, bh) && !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { ext4_error_inode_err(inode, func, line, block, EFSBADCRC, "Directory block failed checksum"); brelse(bh); return ERR_PTR(-EFSBADCRC); } } return bh; } #ifdef DX_DEBUG #define dxtrace(command) command #else #define dxtrace(command) #endif struct fake_dirent { __le32 inode; __le16 rec_len; u8 name_len; u8 file_type; }; struct dx_countlimit { __le16 limit; __le16 count; }; struct dx_entry { __le32 hash; __le32 block; }; /* * dx_root_info is laid out so that if it should somehow get overlaid by a * dirent the two low bits of the hash version will be zero. Therefore, the * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
*/ struct dx_root { struct fake_dirent dot; char dot_name[4]; struct fake_dirent dotdot; char dotdot_name[4]; struct dx_root_info { __le32 reserved_zero; u8 hash_version; u8 info_length; /* 8 */ u8 indirect_levels; u8 unused_flags; } info; struct dx_entry entries[]; }; struct dx_node { struct fake_dirent fake; struct dx_entry entries[]; }; struct dx_frame { struct buffer_head *bh; struct dx_entry *entries; struct dx_entry *at; }; struct dx_map_entry { u32 hash; u16 offs; u16 size; }; /* * This goes at the end of each htree block. */ struct dx_tail { u32 dt_reserved; __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */ }; static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); static inline unsigned dx_get_hash(struct dx_entry *entry); static void dx_set_hash(struct dx_entry *entry, unsigned value); static unsigned dx_get_count(struct dx_entry *entries); static unsigned dx_get_limit(struct dx_entry *entries); static void dx_set_count(struct dx_entry *entries, unsigned value); static void dx_set_limit(struct dx_entry *entries, unsigned value); static unsigned dx_root_limit(struct inode *dir, unsigned infosize); static unsigned dx_node_limit(struct inode *dir); static struct dx_frame *dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame); static void dx_release(struct dx_frame *frames); static int dx_make_map(struct inode *dir, struct buffer_head *bh, struct dx_hash_info *hinfo, struct dx_map_entry *map_tail); static void dx_sort_map(struct dx_map_entry *map, unsigned count); static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from, char *to, struct dx_map_entry *offsets, int count, unsigned int blocksize); static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, unsigned int blocksize); static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block); static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, struct dx_frame *frames, __u32 *start_hash); static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); /* checksumming functions */ void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize) { struct ext4_dir_entry_tail *t = EXT4_DIRENT_TAIL(bh->b_data, blocksize); memset(t, 0, sizeof(struct ext4_dir_entry_tail)); t->det_rec_len = ext4_rec_len_to_disk( sizeof(struct ext4_dir_entry_tail), blocksize); t->det_reserved_ft = EXT4_FT_DIR_CSUM; } /* Walk through a dirent block to find a checksum "dirent" at the tail */ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); #ifdef PARANOID struct ext4_dir_entry *d, *top; d = (struct ext4_dir_entry *)bh->b_data; top = (struct ext4_dir_entry *)(bh->b_data + (blocksize - sizeof(struct ext4_dir_entry_tail))); while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize)) d = (struct ext4_dir_entry *)(((void *)d) + ext4_rec_len_from_disk(d->rec_len, blocksize)); if (d != top) return NULL; t = (struct ext4_dir_entry_tail *)d; #else t = EXT4_DIRENT_TAIL(bh->b_data, EXT4_BLOCK_SIZE(inode->i_sb)); #endif if (t->det_reserved_zero1 || (ext4_rec_len_from_disk(t->det_rec_len, blocksize) != sizeof(struct 
ext4_dir_entry_tail)) || t->det_reserved_zero2 || t->det_reserved_ft != EXT4_FT_DIR_CSUM) return NULL; return t; } static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); return cpu_to_le32(csum); } #define warn_no_space_for_csum(inode) \ __warn_no_space_for_csum((inode), __func__, __LINE__) static void __warn_no_space_for_csum(struct inode *inode, const char *func, unsigned int line) { __ext4_warning_inode(inode, func, line, "No space for directory leaf checksum. Please run e2fsck -D."); } int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; if (!ext4_has_metadata_csum(inode->i_sb)) return 1; t = get_dirent_tail(inode, bh); if (!t) { warn_no_space_for_csum(inode); return 0; } if (t->det_checksum != ext4_dirblock_csum(inode, bh->b_data, (char *)t - bh->b_data)) return 0; return 1; } static void ext4_dirblock_csum_set(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; if (!ext4_has_metadata_csum(inode->i_sb)) return; t = get_dirent_tail(inode, bh); if (!t) { warn_no_space_for_csum(inode); return; } t->det_checksum = ext4_dirblock_csum(inode, bh->b_data, (char *)t - bh->b_data); } int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh) { ext4_dirblock_csum_set(inode, bh); return ext4_handle_dirty_metadata(handle, inode, bh); } static struct dx_countlimit *get_dx_countlimit(struct inode *inode, struct ext4_dir_entry *dirent, int *offset) { struct ext4_dir_entry *dp; struct dx_root_info *root; int count_offset; int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize); if (rlen == blocksize) count_offset = 8; else if (rlen == 12) { dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12) return NULL; root = (struct dx_root_info *)(((void *)dp + 12)); if (root->reserved_zero || root->info_length != sizeof(struct dx_root_info)) return NULL; count_offset = 32; } else return NULL; if (offset) *offset = count_offset; return (struct dx_countlimit *)(((void *)dirent) + count_offset); } static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int count_offset, int count, struct dx_tail *t) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; int size; __u32 dummy_csum = 0; int offset = offsetof(struct dx_tail, dt_checksum); size = count_offset + (count * sizeof(struct dx_entry)); csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); csum = ext4_chksum(sbi, csum, (__u8 *)t, offset); csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); return cpu_to_le32(csum); } static int ext4_dx_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) { struct dx_countlimit *c; struct dx_tail *t; int count_offset, limit, count; if (!ext4_has_metadata_csum(inode->i_sb)) return 1; c = get_dx_countlimit(inode, dirent, &count_offset); if (!c) { EXT4_ERROR_INODE(inode, "dir seems corrupt? 
Run e2fsck -D."); return 0; } limit = le16_to_cpu(c->limit); count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { warn_no_space_for_csum(inode); return 0; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset, count, t)) return 0; return 1; } static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) { struct dx_countlimit *c; struct dx_tail *t; int count_offset, limit, count; if (!ext4_has_metadata_csum(inode->i_sb)) return; c = get_dx_countlimit(inode, dirent, &count_offset); if (!c) { EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); return; } limit = le16_to_cpu(c->limit); count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { warn_no_space_for_csum(inode); return; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t); } static inline int ext4_handle_dirty_dx_node(handle_t *handle, struct inode *inode, struct buffer_head *bh) { ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); return ext4_handle_dirty_metadata(handle, inode, bh); } /* * p is at least 6 bytes before the end of page */ static inline struct ext4_dir_entry_2 * ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) { return (struct ext4_dir_entry_2 *)((char *)p + ext4_rec_len_from_disk(p->rec_len, blocksize)); } /* * Future: use high four bits of block for coalesce-on-delete flags * Mask them off for now. */ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) { return le32_to_cpu(entry->block) & 0x0fffffff; } static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) { entry->block = cpu_to_le32(value); } static inline unsigned dx_get_hash(struct dx_entry *entry) { return le32_to_cpu(entry->hash); } static inline void dx_set_hash(struct dx_entry *entry, unsigned value) { entry->hash = cpu_to_le32(value); } static inline unsigned dx_get_count(struct dx_entry *entries) { return le16_to_cpu(((struct dx_countlimit *) entries)->count); } static inline unsigned dx_get_limit(struct dx_entry *entries) { return le16_to_cpu(((struct dx_countlimit *) entries)->limit); } static inline void dx_set_count(struct dx_entry *entries, unsigned value) { ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); } static inline void dx_set_limit(struct dx_entry *entries, unsigned value) { ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); } static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) { unsigned int entry_space = dir->i_sb->s_blocksize - ext4_dir_rec_len(1, NULL) - ext4_dir_rec_len(2, NULL) - infosize; if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } static inline unsigned dx_node_limit(struct inode *dir) { unsigned int entry_space = dir->i_sb->s_blocksize - ext4_dir_rec_len(0, dir); if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } /* * Debug */ #ifdef DX_DEBUG static void dx_show_index(char * label, struct dx_entry *entries) { int i, n = dx_get_count (entries); printk(KERN_DEBUG "%s index", label); for (i = 0; i < n; i++) { printk(KERN_CONT " %x->%lu", i ? 
dx_get_hash(entries + i) : 0, (unsigned long)dx_get_block(entries + i)); } printk(KERN_CONT "\n"); } struct stats { unsigned names; unsigned space; unsigned bcount; }; static struct stats dx_show_leaf(struct inode *dir, struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de, int size, int show_names) { unsigned names = 0, space = 0; char *base = (char *) de; struct dx_hash_info h = *hinfo; printk("names: "); while ((char *) de < base + size) { if (de->inode) { if (show_names) { #ifdef CONFIG_FS_ENCRYPTION int len; char *name; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0); int res = 0; name = de->name; len = de->name_len; if (!IS_ENCRYPTED(dir)) { /* Directory is not encrypted */ (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:(U)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); } else { struct fscrypt_str de_name = FSTR_INIT(name, len); /* Directory is encrypted */ res = fscrypt_fname_alloc_buffer( len, &fname_crypto_str); if (res) printk(KERN_WARNING "Error " "allocating crypto " "buffer--skipping " "crypto\n"); res = fscrypt_fname_disk_to_usr(dir, 0, 0, &de_name, &fname_crypto_str); if (res) { printk(KERN_WARNING "Error " "converting filename " "from disk to usr" "\n"); name = "??"; len = 2; } else { name = fname_crypto_str.name; len = fname_crypto_str.len; } if (IS_CASEFOLDED(dir)) h.hash = EXT4_DIRENT_HASH(de); else (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); fscrypt_fname_free_buffer( &fname_crypto_str); } #else int len = de->name_len; char *name = de->name; (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); #endif } space += ext4_dir_rec_len(de->name_len, dir); names++; } de = ext4_next_entry(de, size); } printk(KERN_CONT "(%i)\n", names); return (struct stats) { names, space, 1 }; } struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, struct dx_entry *entries, int levels) { unsigned blocksize = dir->i_sb->s_blocksize; unsigned count = dx_get_count(entries), names = 0, space = 0, i; unsigned bcount = 0; struct buffer_head *bh; printk("%i indexed blocks...\n", count); for (i = 0; i < count; i++, entries++) { ext4_lblk_t block = dx_get_block(entries); ext4_lblk_t hash = i ? dx_get_hash(entries): 0; u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; struct stats stats; printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); bh = ext4_bread(NULL,dir, block, 0); if (!bh || IS_ERR(bh)) continue; stats = levels? dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); names += stats.names; space += stats.space; bcount += stats.bcount; brelse(bh); } if (bcount) printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", levels ? "" : " ", names, space/bcount, (space/bcount)*100/blocksize); return (struct stats) { names, space, bcount}; } /* * Linear search cross check */ static inline void htree_rep_invariant_check(struct dx_entry *at, struct dx_entry *target, u32 hash, unsigned int n) { while (n--) { dxtrace(printk(KERN_CONT ",")); if (dx_get_hash(++at) > hash) { at--; break; } } ASSERT(at == target - 1); } #else /* DX_DEBUG */ static inline void htree_rep_invariant_check(struct dx_entry *at, struct dx_entry *target, u32 hash, unsigned int n) { } #endif /* DX_DEBUG */ /* * Probe for a directory leaf block to search. 
* * dx_probe can return ERR_BAD_DX_DIR, which means there was a format * error in the directory index, and the caller should fall back to * searching the directory normally. The callers of dx_probe **MUST** * check for this error code, and make sure it never gets reflected * back to userspace. */ static struct dx_frame * dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in) { unsigned count, indirect, level, i; struct dx_entry *at, *entries, *p, *q, *m; struct dx_root *root; struct dx_frame *frame = frame_in; struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); u32 hash; ext4_lblk_t block; ext4_lblk_t blocks[EXT4_HTREE_LEVEL]; memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); frame->bh = ext4_read_dirblock(dir, 0, INDEX); if (IS_ERR(frame->bh)) return (struct dx_frame *) frame->bh; root = (struct dx_root *) frame->bh->b_data; if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY && root->info.hash_version != DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Unrecognised inode hash code %u", root->info.hash_version); goto fail; } if (ext4_hash_in_dirent(dir)) { if (root->info.hash_version != DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Hash in dirent, but hash is not SIPHASH"); goto fail; } } else { if (root->info.hash_version == DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Hash code is SIPHASH, but hash not in dirent"); goto fail; } } if (fname) hinfo = &fname->hinfo; hinfo->hash_version = root->info.hash_version; if (hinfo->hash_version <= DX_HASH_TEA) hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; /* hash is already computed for encrypted casefolded directory */ if (fname && fname_name(fname) && !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) { int ret = ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo); if (ret < 0) { ret_err = ERR_PTR(ret); goto fail; } } hash = hinfo->hash; if (root->info.unused_flags & 1) { ext4_warning_inode(dir, "Unimplemented hash flags: %#06x", root->info.unused_flags); goto fail; } indirect = root->info.indirect_levels; if (indirect >= ext4_dir_htree_level(dir->i_sb)) { ext4_warning(dir->i_sb, "Directory (ino: %lu) htree depth %#06x exceed" "supported value", dir->i_ino, ext4_dir_htree_level(dir->i_sb)); if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { ext4_warning(dir->i_sb, "Enable large directory " "feature to access it"); } goto fail; } entries = (struct dx_entry *)(((char *)&root->info) + root->info.info_length); if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", dx_get_limit(entries), dx_root_limit(dir, root->info.info_length)); goto fail; } dxtrace(printk("Look up %x", hash)); level = 0; blocks[0] = 0; while (1) { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { ext4_warning_inode(dir, "dx entry: count %u beyond limit %u", count, dx_get_limit(entries)); goto fail; } p = entries + 1; q = entries + count - 1; while (p <= q) { m = p + (q - p) / 2; dxtrace(printk(KERN_CONT ".")); if (dx_get_hash(m) > hash) q = m - 1; else p = m + 1; } htree_rep_invariant_check(entries, p, hash, count - 1); at = p - 1; dxtrace(printk(KERN_CONT " %x->%u\n", at == entries ? 
0 : dx_get_hash(at), dx_get_block(at))); frame->entries = entries; frame->at = at; block = dx_get_block(at); for (i = 0; i <= level; i++) { if (blocks[i] == block) { ext4_warning_inode(dir, "dx entry: tree cycle block %u points back to block %u", blocks[level], block); goto fail; } } if (++level > indirect) return frame; blocks[level] = block; frame++; frame->bh = ext4_read_dirblock(dir, block, INDEX); if (IS_ERR(frame->bh)) { ret_err = (struct dx_frame *) frame->bh; frame->bh = NULL; goto fail; } entries = ((struct dx_node *) frame->bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit(dir)) { ext4_warning_inode(dir, "dx entry: limit %u != node limit %u", dx_get_limit(entries), dx_node_limit(dir)); goto fail; } } fail: while (frame >= frame_in) { brelse(frame->bh); frame--; } if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) ext4_warning_inode(dir, "Corrupt directory, running e2fsck is recommended"); return ret_err; } static void dx_release(struct dx_frame *frames) { struct dx_root_info *info; int i; unsigned int indirect_levels; if (frames[0].bh == NULL) return; info = &((struct dx_root *)frames[0].bh->b_data)->info; /* save local copy, "info" may be freed after brelse() */ indirect_levels = info->indirect_levels; for (i = 0; i <= indirect_levels; i++) { if (frames[i].bh == NULL) break; brelse(frames[i].bh); frames[i].bh = NULL; } } /* * This function increments the frame pointer to search the next leaf * block, and reads in the necessary intervening nodes if the search * should be necessary. Whether or not the search is necessary is * controlled by the hash parameter. If the hash value is even, then * the search is only continued if the next block starts with that * hash value. This is used if we are searching for a specific file. * * If the hash value is HASH_NB_ALWAYS, then always go to the next block. * * This function returns 1 if the caller should continue to search, * or 0 if it should not. If there is an error reading one of the * index blocks, it will a negative error code. * * If start_hash is non-null, it will be filled in with the starting * hash of the next page. */ static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, struct dx_frame *frames, __u32 *start_hash) { struct dx_frame *p; struct buffer_head *bh; int num_frames = 0; __u32 bhash; p = frame; /* * Find the next leaf page by incrementing the frame pointer. * If we run out of entries in the interior node, loop around and * increment pointer in the parent node. When we break out of * this loop, num_frames indicates the number of interior * nodes need to be read. */ while (1) { if (++(p->at) < p->entries + dx_get_count(p->entries)) break; if (p == frames) return 0; num_frames++; p--; } /* * If the hash is 1, then continue only if the next page has a * continuation hash of any value. This is used for readdir * handling. Otherwise, check to see if the hash matches the * desired continuation hash. If it doesn't, return since * there's no point to read in the successive index pages. 
*/ bhash = dx_get_hash(p->at); if (start_hash) *start_hash = bhash; if ((hash & 1) == 0) { if ((bhash & ~1) != hash) return 0; } /* * If the hash is HASH_NB_ALWAYS, we always go to the next * block so no check is necessary */ while (num_frames--) { bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); if (IS_ERR(bh)) return PTR_ERR(bh); p++; brelse(p->bh); p->bh = bh; p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; } return 1; } /* * This function fills a red-black tree with information from a * directory block. It returns the number directory entries loaded * into the tree. If there is an error it is returned in err. */ static int htree_dirblock_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, struct dx_hash_info *hinfo, __u32 start_hash, __u32 start_minor_hash) { struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; int csum = ext4_has_metadata_csum(dir->i_sb); dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); bh = ext4_read_dirblock(dir, block, DIRENT_HTREE); if (IS_ERR(bh)) return PTR_ERR(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; /* csum entries are not larger in the casefolded encrypted case */ top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - ext4_dir_rec_len(0, csum ? NULL : dir)); /* Check if the directory is encrypted */ if (IS_ENCRYPTED(dir)) { err = fscrypt_prepare_readdir(dir); if (err < 0) { brelse(bh); return err; } err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fname_crypto_str); if (err < 0) { brelse(bh); return err; } } for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) + ((char *)de - bh->b_data))) { /* silently ignore the rest of the block */ break; } if (ext4_hash_in_dirent(dir)) { if (de->name_len && de->inode) { hinfo->hash = EXT4_DIRENT_HASH(de); hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); } else { hinfo->hash = 0; hinfo->minor_hash = 0; } } else { err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo); if (err < 0) { count = err; goto errout; } } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) continue; if (de->inode == 0) continue; if (!IS_ENCRYPTED(dir)) { tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); } else { int save_len = fname_crypto_str.len; struct fscrypt_str de_name = FSTR_INIT(de->name, de->name_len); /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(dir, hinfo->hash, hinfo->minor_hash, &de_name, &fname_crypto_str); if (err) { count = err; goto errout; } err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &fname_crypto_str); fname_crypto_str.len = save_len; } if (err != 0) { count = err; goto errout; } count++; } errout: brelse(bh); fscrypt_fname_free_buffer(&fname_crypto_str); return count; } /* * This function fills a red-black tree with information from a * directory. We start scanning the directory in hash order, starting * at start_hash and start_minor_hash. * * This function returns the number of entries inserted into the tree, * or a negative error code. 
*/ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash) { struct dx_hash_info hinfo; struct ext4_dir_entry_2 *de; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct inode *dir; ext4_lblk_t block; int count = 0; int ret, err; __u32 hashval; struct fscrypt_str tmp_str; dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); dir = file_inode(dir_file); if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { if (ext4_hash_in_dirent(dir)) hinfo.hash_version = DX_HASH_SIPHASH; else hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; if (hinfo.hash_version <= DX_HASH_TEA) hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; if (ext4_has_inline_data(dir)) { int has_inline_data = 1; count = ext4_inlinedir_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash, &has_inline_data); if (has_inline_data) { *next_hash = ~0; return count; } } count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); *next_hash = ~0; return count; } hinfo.hash = start_hash; hinfo.minor_hash = 0; frame = dx_probe(NULL, dir, &hinfo, frames); if (IS_ERR(frame)) return PTR_ERR(frame); /* Add '.' and '..' from the htree header */ if (!start_hash && !start_minor_hash) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, 0, 0, de, &tmp_str); if (err != 0) goto errout; count++; } if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; de = ext4_next_entry(de, dir->i_sb->s_blocksize); tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, 2, 0, de, &tmp_str); if (err != 0) goto errout; count++; } while (1) { if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto errout; } cond_resched(); block = dx_get_block(frame->at); ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, start_hash, start_minor_hash); if (ret < 0) { err = ret; goto errout; } count += ret; hashval = ~0; ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, frame, frames, &hashval); *next_hash = hashval; if (ret < 0) { err = ret; goto errout; } /* * Stop if: (a) there are no more entries, or * (b) we have inserted at least one entry and the * next hash value is not a continuation */ if ((ret == 0) || (count && ((hashval & 1) == 0))) break; } dx_release(frames); dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, " "next hash: %x\n", count, *next_hash)); return count; errout: dx_release(frames); return (err); } static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, fname, offset, res_dir); } /* * Directory block splitting, compacting */ /* * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. 
*/ static int dx_make_map(struct inode *dir, struct buffer_head *bh, struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) { int count = 0; struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)bh->b_data; unsigned int buflen = bh->b_size; char *base = bh->b_data; struct dx_hash_info h = *hinfo; int blocksize = EXT4_BLOCK_SIZE(dir->i_sb); if (ext4_has_metadata_csum(dir->i_sb)) buflen -= sizeof(struct ext4_dir_entry_tail); while ((char *) de < base + buflen) { if (ext4_check_dir_entry(dir, NULL, de, bh, base, buflen, ((char *)de) - base)) return -EFSCORRUPTED; if (de->name_len && de->inode) { if (ext4_hash_in_dirent(dir)) h.hash = EXT4_DIRENT_HASH(de); else { int err = ext4fs_dirhash(dir, de->name, de->name_len, &h); if (err < 0) return err; } map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; map_tail->size = ext4_rec_len_from_disk(de->rec_len, blocksize); count++; cond_resched(); } de = ext4_next_entry(de, blocksize); } return count; } /* Sort map by hash value */ static void dx_sort_map (struct dx_map_entry *map, unsigned count) { struct dx_map_entry *p, *q, *top = map + count - 1; int more; /* Combsort until bubble sort doesn't suck */ while (count > 2) { count = count*10/13; if (count - 9 < 2) /* 9, 10 -> 11 */ count = 11; for (p = top, q = p - count; q >= map; p--, q--) if (p->hash < q->hash) swap(*p, *q); } /* Garden variety bubble sort */ do { more = 0; q = top; while (q-- > map) { if (q[1].hash >= q[0].hash) continue; swap(*(q+1), *q); more = 1; } } while(more); } static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) { struct dx_entry *entries = frame->entries; struct dx_entry *old = frame->at, *new = old + 1; int count = dx_get_count(entries); ASSERT(count < dx_get_limit(entries)); ASSERT(old < entries + count); memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); dx_set_hash(new, hash); dx_set_block(new, block); dx_set_count(entries, count + 1); } #if IS_ENABLED(CONFIG_UNICODE) int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *name) { struct qstr *cf_name = &name->cf_name; unsigned char *buf; struct dx_hash_info *hinfo = &name->hinfo; int len; if (!IS_CASEFOLDED(dir) || (IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) { cf_name->name = NULL; return 0; } buf = kmalloc(EXT4_NAME_LEN, GFP_NOFS); if (!buf) return -ENOMEM; len = utf8_casefold(dir->i_sb->s_encoding, iname, buf, EXT4_NAME_LEN); if (len <= 0) { kfree(buf); buf = NULL; } cf_name->name = buf; cf_name->len = (unsigned) len; if (!IS_ENCRYPTED(dir)) return 0; hinfo->hash_version = DX_HASH_SIPHASH; hinfo->seed = NULL; if (cf_name->name) return ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo); else return ext4fs_dirhash(dir, iname->name, iname->len, hinfo); } #endif /* * Test whether a directory entry matches the filename being searched for. * * Return: %true if the directory entry matches, otherwise %false. 
*/ static bool ext4_match(struct inode *parent, const struct ext4_filename *fname, struct ext4_dir_entry_2 *de) { struct fscrypt_name f; if (!de->inode) return false; f.usr_fname = fname->usr_fname; f.disk_name = fname->disk_name; #ifdef CONFIG_FS_ENCRYPTION f.crypto_buf = fname->crypto_buf; #endif #if IS_ENABLED(CONFIG_UNICODE) if (IS_CASEFOLDED(parent) && (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { /* * Just checking IS_ENCRYPTED(parent) below is not * sufficient to decide whether one can use the hash for * skipping the string comparison, because the key might * have been added right after * ext4_fname_setup_ci_filename(). In this case, a hash * mismatch will be a false negative. Therefore, make * sure cf_name was properly initialized before * considering the calculated hash. */ if (IS_ENCRYPTED(parent) && fname->cf_name.name && (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de))) return false; /* * Treat comparison errors as not a match. The * only case where it happens is on a disk * corruption or ENOMEM. */ return generic_ci_match(parent, fname->usr_fname, &fname->cf_name, de->name, de->name_len) > 0; } #endif return fscrypt_match_name(&f, de->name, de->name_len); } /* * Returns 0 if not found, -EFSCORRUPTED on failure, and 1 on success */ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct inode *dir, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; int de_len; de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ if (de->name + de->name_len <= dlimit && ext4_match(dir, fname, de)) { /* found a match - just to be sure, do * a full check */ if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf, buf_size, offset)) return -EFSCORRUPTED; *res_dir = de; return 1; } /* prevent looping on a bad block */ de_len = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); if (de_len <= 0) return -EFSCORRUPTED; offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } return 0; } static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, struct ext4_dir_entry *de) { struct super_block *sb = dir->i_sb; if (!is_dx(dir)) return 0; if (block == 0) return 1; if (de->inode == 0 && ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) == sb->s_blocksize) return 1; return 0; } /* * __ext4_find_entry() * * finds an entry in the specified directory with the wanted name. It * returns the cache buffer in which the entry was found, and the entry * itself (as a parameter - res_dir). It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. * * The returned buffer_head has ->b_count elevated. The caller is expected * to brelse() it when appropriate. 
*/ static struct buffer_head *__ext4_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir, int *inlined) { struct super_block *sb; struct buffer_head *bh_use[NAMEI_RA_SIZE]; struct buffer_head *bh, *ret = NULL; ext4_lblk_t start, block; const u8 *name = fname->usr_fname->name; size_t ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ size_t ra_ptr = 0; /* Current index into readahead buffer */ ext4_lblk_t nblocks; int i, namelen, retval; *res_dir = NULL; sb = dir->i_sb; namelen = fname->usr_fname->len; if (namelen > EXT4_NAME_LEN) return NULL; if (ext4_has_inline_data(dir)) { int has_inline_data = 1; ret = ext4_find_inline_entry(dir, fname, res_dir, &has_inline_data); if (inlined) *inlined = has_inline_data; if (has_inline_data || IS_ERR(ret)) goto cleanup_and_exit; } if ((namelen <= 2) && (name[0] == '.') && (name[1] == '.' || name[1] == '\0')) { /* * "." or ".." will only be in the first block * NFS may look up ".."; "." should be handled by the VFS */ block = start = 0; nblocks = 1; goto restart; } if (is_dx(dir)) { ret = ext4_dx_find_entry(dir, fname, res_dir); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the * old fashioned way. */ if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR) goto cleanup_and_exit; dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); ret = NULL; } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); if (!nblocks) { ret = NULL; goto cleanup_and_exit; } start = EXT4_I(dir)->i_dir_start_lookup; if (start >= nblocks) start = 0; block = start; restart: do { /* * We deal with the read-ahead logic here. */ cond_resched(); if (ra_ptr >= ra_max) { /* Refill the readahead buffer */ ra_ptr = 0; if (block < start) ra_max = start - block; else ra_max = nblocks - block; ra_max = min(ra_max, ARRAY_SIZE(bh_use)); retval = ext4_bread_batch(dir, block, ra_max, false /* wait */, bh_use); if (retval) { ret = ERR_PTR(retval); ra_max = 0; goto cleanup_and_exit; } } if ((bh = bh_use[ra_ptr++]) == NULL) goto next; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_ERR(dir, EIO, "reading directory lblock %lu", (unsigned long) block); brelse(bh); ret = ERR_PTR(-EIO); goto cleanup_and_exit; } if (!buffer_verified(bh) && !is_dx_internal_node(dir, block, (struct ext4_dir_entry *)bh->b_data) && !ext4_dirblock_csum_verify(dir, bh)) { EXT4_ERROR_INODE_ERR(dir, EFSBADCRC, "checksumming directory " "block %lu", (unsigned long)block); brelse(bh); ret = ERR_PTR(-EFSBADCRC); goto cleanup_and_exit; } set_buffer_verified(bh); i = search_dirblock(bh, dir, fname, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; ret = bh; goto cleanup_and_exit; } else { brelse(bh); if (i < 0) { ret = ERR_PTR(i); goto cleanup_and_exit; } } next: if (++block >= nblocks) block = 0; } while (block != start); /* * If the directory has grown while we were searching, then * search the last part of the directory before giving up. 
*/ block = nblocks; nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); if (block < nblocks) { start = 0; goto restart; } cleanup_and_exit: /* Clean up the read-ahead blocks */ for (; ra_ptr < ra_max; ra_ptr++) brelse(bh_use[ra_ptr]); return ret; } static struct buffer_head *ext4_find_entry(struct inode *dir, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *inlined) { int err; struct ext4_filename fname; struct buffer_head *bh; err = ext4_fname_setup_filename(dir, d_name, 1, &fname); if (err == -ENOENT) return NULL; if (err) return ERR_PTR(err); bh = __ext4_find_entry(dir, &fname, res_dir, inlined); ext4_fname_free_filename(&fname); return bh; } static struct buffer_head *ext4_lookup_entry(struct inode *dir, struct dentry *dentry, struct ext4_dir_entry_2 **res_dir) { int err; struct ext4_filename fname; struct buffer_head *bh; err = ext4_fname_prepare_lookup(dir, dentry, &fname); if (err == -ENOENT) return NULL; if (err) return ERR_PTR(err); bh = __ext4_find_entry(dir, &fname, res_dir, NULL); ext4_fname_free_filename(&fname); return bh; } static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir) { struct super_block * sb = dir->i_sb; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct buffer_head *bh; ext4_lblk_t block; int retval; #ifdef CONFIG_FS_ENCRYPTION *res_dir = NULL; #endif frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return ERR_CAST(frame); do { block = dx_get_block(frame->at); bh = ext4_read_dirblock(dir, block, DIRENT_HTREE); if (IS_ERR(bh)) goto errout; retval = search_dirblock(bh, dir, fname, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (retval == 1) goto success; brelse(bh); if (retval < 0) { bh = ERR_PTR(ERR_BAD_DX_DIR); goto errout; } /* Check to see if we should continue to search */ retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, frames, NULL); if (retval < 0) { ext4_warning_inode(dir, "error %d reading directory index block", retval); bh = ERR_PTR(retval); goto errout; } } while (retval == 1); bh = NULL; errout: dxtrace(printk(KERN_DEBUG "%s not found\n", fname->usr_fname->name)); success: dx_release(frames); return bh; } static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; struct ext4_dir_entry_2 *de; struct buffer_head *bh; if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); bh = ext4_lookup_entry(dir, dentry, &de); if (IS_ERR(bh)) return ERR_CAST(bh); inode = NULL; if (bh) { __u32 ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); return ERR_PTR(-EFSCORRUPTED); } if (unlikely(ino == dir->i_ino)) { EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir", dentry); return ERR_PTR(-EFSCORRUPTED); } inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL); if (inode == ERR_PTR(-ESTALE)) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", ino); return ERR_PTR(-EFSCORRUPTED); } if (!IS_ERR(inode) && IS_ENCRYPTED(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { ext4_warning(inode->i_sb, "Inconsistent encryption contexts: %lu/%lu", dir->i_ino, inode->i_ino); iput(inode); return ERR_PTR(-EPERM); } } if (IS_ENABLED(CONFIG_UNICODE) && !inode && IS_CASEFOLDED(dir)) { /* Eventually we want to call d_add_ci(dentry, NULL) * for negative dentries in the encoding case as * well. For now, prevent the negative dentry * from being cached. 
*/ return NULL; } return d_splice_alias(inode, dentry); } struct dentry *ext4_get_parent(struct dentry *child) { __u32 ino; struct ext4_dir_entry_2 * de; struct buffer_head *bh; bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL); if (IS_ERR(bh)) return ERR_CAST(bh); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(child->d_sb, ino)) { EXT4_ERROR_INODE(d_inode(child), "bad parent inode number: %u", ino); return ERR_PTR(-EFSCORRUPTED); } return d_obtain_alias(ext4_iget(child->d_sb, ino, EXT4_IGET_NORMAL)); } /* * Move count entries from end of map between two memory locations. * Returns pointer to last entry moved. */ static struct ext4_dir_entry_2 * dx_move_dirents(struct inode *dir, char *from, char *to, struct dx_map_entry *map, int count, unsigned blocksize) { unsigned rec_len = 0; while (count--) { struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + (map->offs<<2)); rec_len = ext4_dir_rec_len(de->name_len, dir); memcpy (to, de, rec_len); ((struct ext4_dir_entry_2 *) to)->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); /* wipe dir_entry excluding the rec_len field */ de->inode = 0; memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len, blocksize) - offsetof(struct ext4_dir_entry_2, name_len)); map++; to += rec_len; } return (struct ext4_dir_entry_2 *) (to - rec_len); } /* * Compact each dir entry in the range to the minimal rec_len. * Returns pointer to last entry in range. */ static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, unsigned int blocksize) { struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; unsigned rec_len = 0; prev = to = de; while ((char*)de < base + blocksize) { next = ext4_next_entry(de, blocksize); if (de->inode && de->name_len) { rec_len = ext4_dir_rec_len(de->name_len, dir); if (de > to) memmove(to, de, rec_len); to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); prev = to; to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); } de = next; } return prev; } /* * Split a full leaf block to make room for a new dir entry. * Allocate a new block, and move entries so that they are approx. equally full. * Returns pointer to de in block into which the new entry will be inserted. 
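 *
 * The live entries are mapped to (hash, offset, size) tuples by
 * dx_make_map(), sorted by hash, and a split point is picked so the
 * half that stays in the old block is not left more than roughly
 * half full.  Entries above the split move to the freshly appended
 * block and the boundary hash is inserted into the parent index via
 * dx_insert_block().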
*/ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, struct buffer_head **bh,struct dx_frame *frame, struct dx_hash_info *hinfo) { unsigned blocksize = dir->i_sb->s_blocksize; unsigned continued; int count; struct buffer_head *bh2; ext4_lblk_t newblock; u32 hash2; struct dx_map_entry *map; char *data1 = (*bh)->b_data, *data2; unsigned split, move, size; struct ext4_dir_entry_2 *de = NULL, *de2; int csum_size = 0; int err = 0, i; if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); bh2 = ext4_append(handle, dir, &newblock); if (IS_ERR(bh2)) { brelse(*bh); *bh = NULL; return ERR_CAST(bh2); } BUFFER_TRACE(*bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, *bh, EXT4_JTR_NONE); if (err) goto journal_error; BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, frame->bh, EXT4_JTR_NONE); if (err) goto journal_error; data2 = bh2->b_data; /* create map in the end of data2 block */ map = (struct dx_map_entry *) (data2 + blocksize); count = dx_make_map(dir, *bh, hinfo, map); if (count < 0) { err = count; goto journal_error; } map -= count; dx_sort_map(map, count); /* Ensure that neither split block is over half full */ size = 0; move = 0; for (i = count-1; i >= 0; i--) { /* is more than half of this entry in 2nd half of the block? */ if (size + map[i].size/2 > blocksize/2) break; size += map[i].size; move++; } /* * map index at which we will split * * If the sum of active entries didn't exceed half the block size, just * split it in half by count; each resulting block will have at least * half the space free. */ if (i > 0) split = count - move; else split = count/2; if (WARN_ON_ONCE(split == 0)) { /* Should never happen, but avoid out-of-bounds access below */ ext4_error_inode_block(dir, (*bh)->b_blocknr, 0, "bad indexed directory? hash=%08x:%08x count=%d move=%u", hinfo->hash, hinfo->minor_hash, count, move); err = -EFSCORRUPTED; goto out; } hash2 = map[split].hash; continued = hash2 == map[split - 1].hash; dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", (unsigned long)dx_get_block(frame->at), hash2, split, count-split)); /* Fancy dance to stay within two buffers */ de2 = dx_move_dirents(dir, data1, data2, map + split, count - split, blocksize); de = dx_pack_dirents(dir, data1, blocksize); de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - (char *) de, blocksize); de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) - (char *) de2, blocksize); if (csum_size) { ext4_initialize_dirent_tail(*bh, blocksize); ext4_initialize_dirent_tail(bh2, blocksize); } dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); /* Which block gets the new entry? 
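 *
 * After the move, entries hashing at or above hash2 live in the new
 * block.  If the name being inserted falls on that side, *bh and bh2
 * are swapped so that the buffer handed back to the caller (and the
 * returned dirent) belong to the block that will receive the entry.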
*/ if (hinfo->hash >= hash2) { swap(*bh, bh2); de = de2; } dx_insert_block(frame, hash2 + continued, newblock); err = ext4_handle_dirty_dirblock(handle, dir, bh2); if (err) goto journal_error; err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; brelse(bh2); dxtrace(dx_show_index("frame", frame->entries)); return de; journal_error: ext4_std_error(dir->i_sb, err); out: brelse(*bh); brelse(bh2); *bh = NULL; return ERR_PTR(err); } int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de) { struct ext4_dir_entry_2 *de; unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir); int nlen, rlen; unsigned int offset = 0; char *top; de = buf; top = buf + buf_size - reclen; while ((char *) de <= top) { if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) return -EFSCORRUPTED; if (ext4_match(dir, fname, de)) return -EEXIST; nlen = ext4_dir_rec_len(de->name_len, dir); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if ((de->inode ? rlen - nlen : rlen) >= reclen) break; de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } if ((char *) de > top) return -ENOSPC; *dest_de = de; return 0; } void ext4_insert_dentry(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, struct ext4_filename *fname) { int nlen, rlen; nlen = ext4_dir_rec_len(de->name_len, dir); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if (de->inode) { struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); de = de1; } de->file_type = EXT4_FT_UNKNOWN; de->inode = cpu_to_le32(inode->i_ino); ext4_set_de_type(inode->i_sb, de, inode->i_mode); de->name_len = fname_len(fname); memcpy(de->name, fname_name(fname), fname_len(fname)); if (ext4_hash_in_dirent(dir)) { struct dx_hash_info *hinfo = &fname->hinfo; EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo->hash); EXT4_DIRENT_HASHES(de)->minor_hash = cpu_to_le32(hinfo->minor_hash); } } /* * Add a new entry into a directory (leaf) block. If de is non-NULL, * it points to a directory entry which is guaranteed to be large * enough for new directory entry. If de is NULL, then * add_dirent_to_buf will attempt search the directory block for * space. It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. */ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, struct buffer_head *bh) { unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; int err, err2; if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (!de) { err = ext4_find_dest_de(dir, inode, bh, bh->b_data, blocksize - csum_size, fname, &de); if (err) return err; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); if (err) { ext4_std_error(dir->i_sb, err); return err; } /* By now the buffer is marked for journaling */ ext4_insert_dentry(dir, inode, de, blocksize, fname); /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend * on this. 
* * XXX similarly, too many callers depend on * ext4_new_inode() setting the times, but error * recovery deletes the inode, so the worst that can * happen is that the times are slightly out of date * and/or different from the directory change time. */ inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); inode_inc_iversion(dir); err2 = ext4_mark_inode_dirty(handle, dir); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirblock(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); return err ? err : err2; } static bool ext4_check_dx_root(struct inode *dir, struct dx_root *root) { struct fake_dirent *fde; const char *error_msg; unsigned int rlen; unsigned int blocksize = dir->i_sb->s_blocksize; char *blockend = (char *)root + dir->i_sb->s_blocksize; fde = &root->dot; if (unlikely(fde->name_len != 1)) { error_msg = "invalid name_len for '.'"; goto corrupted; } if (unlikely(strncmp(root->dot_name, ".", fde->name_len))) { error_msg = "invalid name for '.'"; goto corrupted; } rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize); if (unlikely((char *)fde + rlen >= blockend)) { error_msg = "invalid rec_len for '.'"; goto corrupted; } fde = &root->dotdot; if (unlikely(fde->name_len != 2)) { error_msg = "invalid name_len for '..'"; goto corrupted; } if (unlikely(strncmp(root->dotdot_name, "..", fde->name_len))) { error_msg = "invalid name for '..'"; goto corrupted; } rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize); if (unlikely((char *)fde + rlen >= blockend)) { error_msg = "invalid rec_len for '..'"; goto corrupted; } return true; corrupted: EXT4_ERROR_INODE(dir, "Corrupt dir, %s, running e2fsck is recommended", error_msg); return false; } /* * This converts a one block unindexed directory to a 3 block indexed * directory, and adds the dentry to the indexed directory. 
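 *
 * Block 0 keeps only "." and ".." and becomes the dx_root: the
 * remaining dirents are copied into a newly appended block, ".."'s
 * rec_len is stretched to cover the rest of the root block, and the
 * dx_root_info plus the single index entry are initialised.  The
 * copied-out leaf is then split with do_split() before the new name
 * is added.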
*/ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode, struct buffer_head *bh) { struct buffer_head *bh2; struct dx_root *root; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries; struct ext4_dir_entry_2 *de, *de2; char *data2, *top; unsigned len; int retval; unsigned blocksize; ext4_lblk_t block; struct fake_dirent *fde; int csum_size = 0; if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); BUFFER_TRACE(bh, "get_write_access"); retval = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); if (retval) { ext4_std_error(dir->i_sb, retval); brelse(bh); return retval; } root = (struct dx_root *) bh->b_data; if (!ext4_check_dx_root(dir, root)) { brelse(bh); return -EFSCORRUPTED; } /* The 0th block becomes the root, move the dirents out */ fde = &root->dotdot; de = (struct ext4_dir_entry_2 *)((char *)fde + ext4_rec_len_from_disk(fde->rec_len, blocksize)); len = ((char *) root) + (blocksize - csum_size) - (char *) de; /* Allocate new block for the 0th block's dirents */ bh2 = ext4_append(handle, dir, &block); if (IS_ERR(bh2)) { brelse(bh); return PTR_ERR(bh2); } ext4_set_inode_flag(dir, EXT4_INODE_INDEX); data2 = bh2->b_data; memcpy(data2, de, len); memset(de, 0, len); /* wipe old data */ de = (struct ext4_dir_entry_2 *) data2; top = data2 + len; while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) { if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len, (char *)de - data2)) { brelse(bh2); brelse(bh); return -EFSCORRUPTED; } de = de2; } de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) - (char *) de, blocksize); if (csum_size) ext4_initialize_dirent_tail(bh2, blocksize); /* Initialize the root; the dot dirents already exist */ de = (struct ext4_dir_entry_2 *) (&root->dotdot); de->rec_len = ext4_rec_len_to_disk( blocksize - ext4_dir_rec_len(2, NULL), blocksize); memset (&root->info, 0, sizeof(root->info)); root->info.info_length = sizeof(root->info); if (ext4_hash_in_dirent(dir)) root->info.hash_version = DX_HASH_SIPHASH; else root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; entries = root->entries; dx_set_block(entries, 1); dx_set_count(entries, 1); dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); /* Initialize as for dx_probe */ fname->hinfo.hash_version = root->info.hash_version; if (fname->hinfo.hash_version <= DX_HASH_TEA) fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; /* casefolded encrypted hashes are computed on fname setup */ if (!ext4_hash_in_dirent(dir)) { int err = ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), &fname->hinfo); if (err < 0) { brelse(bh2); brelse(bh); return err; } } memset(frames, 0, sizeof(frames)); frame = frames; frame->entries = entries; frame->at = entries; frame->bh = bh; retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (retval) goto out_frames; retval = ext4_handle_dirty_dirblock(handle, dir, bh2); if (retval) goto out_frames; de = do_split(handle,dir, &bh2, frame, &fname->hinfo); if (IS_ERR(de)) { retval = PTR_ERR(de); goto out_frames; } retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2); out_frames: /* * Even if the block split failed, we have to properly write * out all the changes we did so far. Otherwise we can end up * with corrupted filesystem. 
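 *
 * At this point EXT4_INODE_INDEX has already been set and the
 * dirents have been migrated out of block 0, so even on failure the
 * inode is marked dirty and the frames and buffers are released
 * below; bailing out without writing back would leave a
 * half-converted directory on disk.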
*/ if (retval) ext4_mark_inode_dirty(handle, dir); dx_release(frames); brelse(bh2); return retval; } /* * ext4_add_entry() * * adds a file entry to the specified directory, using the same * semantics as ext4_find_entry(). It returns NULL if it failed. * * NOTE!! The inode part of 'de' is left at 0 - which means you * may not sleep between calling this and putting something into * the entry, as someone else might have used it while you slept. */ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode) { struct inode *dir = d_inode(dentry->d_parent); struct buffer_head *bh = NULL; struct ext4_dir_entry_2 *de; struct super_block *sb; struct ext4_filename fname; int retval; int dx_fallback=0; unsigned blocksize; ext4_lblk_t block, blocks; int csum_size = 0; if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; blocksize = sb->s_blocksize; if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) return -EINVAL; retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); if (retval) return retval; if (ext4_has_inline_data(dir)) { retval = ext4_try_add_inline_entry(handle, &fname, dir, inode); if (retval < 0) goto out; if (retval == 1) { retval = 0; goto out; } } if (is_dx(dir)) { retval = ext4_dx_add_entry(handle, &fname, dir, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; /* Can we just ignore htree data? */ if (ext4_has_metadata_csum(sb)) { EXT4_ERROR_INODE(dir, "Directory has corrupted htree index."); retval = -EFSCORRUPTED; goto out; } ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); dx_fallback++; retval = ext4_mark_inode_dirty(handle, dir); if (unlikely(retval)) goto out; } blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { bh = ext4_read_dirblock(dir, block, DIRENT); if (bh == NULL) { bh = ext4_bread(handle, dir, block, EXT4_GET_BLOCKS_CREATE); goto add_to_new_block; } if (IS_ERR(bh)) { retval = PTR_ERR(bh); bh = NULL; goto out; } retval = add_dirent_to_buf(handle, &fname, dir, inode, NULL, bh); if (retval != -ENOSPC) goto out; if (blocks == 1 && !dx_fallback && ext4_has_feature_dir_index(sb)) { retval = make_indexed_dir(handle, &fname, dir, inode, bh); bh = NULL; /* make_indexed_dir releases bh */ goto out; } brelse(bh); } bh = ext4_append(handle, dir, &block); add_to_new_block: if (IS_ERR(bh)) { retval = PTR_ERR(bh); bh = NULL; goto out; } de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); if (csum_size) ext4_initialize_dirent_tail(bh, blocksize); retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh); out: ext4_fname_free_filename(&fname); brelse(bh); if (retval == 0) ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); return retval; } /* * Returns 0 for success, or a negative error value */ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; struct dx_entry *entries, *at; struct buffer_head *bh; struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int restart; int err; again: restart = 0; frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return PTR_ERR(frame); entries = frame->entries; at = frame->at; bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT_HTREE); if (IS_ERR(bh)) { err = PTR_ERR(bh); bh = NULL; goto cleanup; } BUFFER_TRACE(bh, "get_write_access"); err 
= ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) goto journal_error; err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh); if (err != -ENOSPC) goto cleanup; err = 0; /* Block full, should compress but for now just split */ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", dx_get_count(entries), dx_get_limit(entries))); /* Need to split index? */ if (dx_get_count(entries) == dx_get_limit(entries)) { ext4_lblk_t newblock; int levels = frame - frames + 1; unsigned int icount; int add_level = 1; struct dx_entry *entries2; struct dx_node *node2; struct buffer_head *bh2; while (frame > frames) { if (dx_get_count((frame - 1)->entries) < dx_get_limit((frame - 1)->entries)) { add_level = 0; break; } frame--; /* split higher index block */ at = frame->at; entries = frame->entries; restart = 1; } if (add_level && levels == ext4_dir_htree_level(sb)) { ext4_warning(sb, "Directory (ino: %lu) index full, " "reach max htree level :%d", dir->i_ino, levels); if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { ext4_warning(sb, "Large directory feature is " "not enabled on this " "filesystem"); } err = -ENOSPC; goto cleanup; } icount = dx_get_count(entries); bh2 = ext4_append(handle, dir, &newblock); if (IS_ERR(bh2)) { err = PTR_ERR(bh2); goto cleanup; } node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; memset(&node2->fake, 0, sizeof(struct fake_dirent)); node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, sb->s_blocksize); BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, frame->bh, EXT4_JTR_NONE); if (err) goto journal_error; if (!add_level) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", icount1, icount2)); BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ err = ext4_journal_get_write_access(handle, sb, (frame - 1)->bh, EXT4_JTR_NONE); if (err) goto journal_error; memcpy((char *) entries2, (char *) (entries + icount1), icount2 * sizeof(struct dx_entry)); dx_set_count(entries, icount1); dx_set_count(entries2, icount2); dx_set_limit(entries2, dx_node_limit(dir)); /* Which index block gets the new entry? 
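 *
 * Entries at index >= icount1 have just been copied into the new
 * node (entries2/bh2).  If the insertion point 'at' was among them,
 * the frame is rebased onto the new node so that the subsequent
 * index insertion and leaf split operate on the block that actually
 * holds the target entry.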
*/ if (at - entries >= icount1) { frame->at = at - entries - icount1 + entries2; frame->entries = entries = entries2; swap(frame->bh, bh2); } dx_insert_block((frame - 1), hash2, newblock); dxtrace(dx_show_index("node", frame->entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); err = ext4_handle_dirty_dx_node(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); err = ext4_handle_dirty_dx_node(handle, dir, (frame - 1)->bh); if (err) goto journal_error; err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (restart || err) goto journal_error; } else { struct dx_root *dxroot; memcpy((char *) entries2, (char *) entries, icount * sizeof(struct dx_entry)); dx_set_limit(entries2, dx_node_limit(dir)); /* Set up root */ dx_set_count(entries, 1); dx_set_block(entries + 0, newblock); dxroot = (struct dx_root *)frames[0].bh->b_data; dxroot->info.indirect_levels += 1; dxtrace(printk(KERN_DEBUG "Creating %d level index...\n", dxroot->info.indirect_levels)); err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; err = ext4_handle_dirty_dx_node(handle, dir, bh2); brelse(bh2); restart = 1; goto journal_error; } } de = do_split(handle, dir, &bh, frame, &fname->hinfo); if (IS_ERR(de)) { err = PTR_ERR(de); goto cleanup; } err = add_dirent_to_buf(handle, fname, dir, inode, de, bh); goto cleanup; journal_error: ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ cleanup: brelse(bh); dx_release(frames); /* @restart is true means htree-path has been changed, we need to * repeat dx_probe() to find out valid htree-path */ if (restart && err == 0) goto again; return err; } /* * ext4_generic_delete_entry deletes a directory entry by merging it * with the previous entry */ int ext4_generic_delete_entry(struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, void *entry_buf, int buf_size, int csum_size) { struct ext4_dir_entry_2 *de, *pde; unsigned int blocksize = dir->i_sb->s_blocksize; int i; i = 0; pde = NULL; de = entry_buf; while (i < buf_size - csum_size) { if (ext4_check_dir_entry(dir, NULL, de, bh, entry_buf, buf_size, i)) return -EFSCORRUPTED; if (de == de_del) { if (pde) { pde->rec_len = ext4_rec_len_to_disk( ext4_rec_len_from_disk(pde->rec_len, blocksize) + ext4_rec_len_from_disk(de->rec_len, blocksize), blocksize); /* wipe entire dir_entry */ memset(de, 0, ext4_rec_len_from_disk(de->rec_len, blocksize)); } else { /* wipe dir_entry excluding the rec_len field */ de->inode = 0; memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len, blocksize) - offsetof(struct ext4_dir_entry_2, name_len)); } inode_inc_iversion(dir); return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); pde = de; de = ext4_next_entry(de, blocksize); } return -ENOENT; } static int ext4_delete_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh) { int err, csum_size = 0; if (ext4_has_inline_data(dir)) { int has_inline_data = 1; err = ext4_delete_inline_entry(handle, dir, de_del, bh, &has_inline_data); if (has_inline_data) return err; } if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); if (unlikely(err)) goto out; err = ext4_generic_delete_entry(dir, de_del, bh, bh->b_data, dir->i_sb->s_blocksize, csum_size); if (err) goto out; BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirblock(handle, dir, 
					    bh);
	if (unlikely(err))
		goto out;

	return 0;
out:
	if (err != -ENOENT)
		ext4_std_error(dir->i_sb, err);
	return err;
}

/*
 * Set directory link count to 1 if nlinks > EXT4_LINK_MAX, or if nlinks == 2
 * since this indicates that nlinks count was previously 1 to avoid overflowing
 * the 16-bit i_links_count field on disk.  Directories with i_nlink == 1 mean
 * that subdirectory link counts are not being maintained accurately.
 *
 * The caller has already checked for i_nlink overflow in case the DIR_LINK
 * feature is not enabled and returned -EMLINK.  The is_dx() check is a proxy
 * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set
 * on regular files) and to avoid creating huge/slow non-HTREE directories.
 */
static void ext4_inc_count(struct inode *inode)
{
	inc_nlink(inode);
	if (is_dx(inode) &&
	    (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2))
		set_nlink(inode, 1);
}

/*
 * If a directory had nlink == 1, then we should let it be 1.  This indicates
 * that the directory has more than EXT4_LINK_MAX subdirectories.
 */
static void ext4_dec_count(struct inode *inode)
{
	if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
		drop_nlink(inode);
}

/*
 * Add a non-directory inode to a directory.  On success, the inode reference
 * is consumed by dentry instantiation.  This is also indicated by clearing of
 * the *inodep pointer.  On failure, the caller is responsible for dropping the
 * inode reference in a safe context.
 */
static int ext4_add_nondir(handle_t *handle,
		struct dentry *dentry, struct inode **inodep)
{
	struct inode *dir = d_inode(dentry->d_parent);
	struct inode *inode = *inodep;
	int err = ext4_add_entry(handle, dentry, inode);

	if (!err) {
		err = ext4_mark_inode_dirty(handle, inode);
		if (IS_DIRSYNC(dir))
			ext4_handle_sync(handle);
		d_instantiate_new(dentry, inode);
		*inodep = NULL;
		return err;
	}
	drop_nlink(inode);
	ext4_mark_inode_dirty(handle, inode);
	ext4_orphan_add(handle, inode);
	unlock_new_inode(inode);
	return err;
}

/*
 * By the time this is called, we already have created
 * the directory cache entry for the new file, but it
 * is so far negative - it has no inode.
 *
 * If the create succeeds, we fill in the inode information
 * with d_instantiate().
*/ static int ext4_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { handle_t *handle; struct inode *inode; int err, credits, retries = 0; err = dquot_initialize(dir); if (err) return err; credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); err = ext4_add_nondir(handle, dentry, &inode); if (!err) ext4_fc_track_create(handle, dentry); } if (handle) ext4_journal_stop(handle); if (!IS_ERR_OR_NULL(inode)) iput(inode); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } static int ext4_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { handle_t *handle; struct inode *inode; int err, credits, retries = 0; err = dquot_initialize(dir); if (err) return err; credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ext4_special_inode_operations; err = ext4_add_nondir(handle, dentry, &inode); if (!err) ext4_fc_track_create(handle, dentry); } if (handle) ext4_journal_stop(handle); if (!IS_ERR_OR_NULL(inode)) iput(inode); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } static int ext4_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { handle_t *handle; struct inode *inode; int err, retries = 0; err = dquot_initialize(dir); if (err) return err; retry: inode = ext4_new_inode_start_handle(idmap, dir, mode, NULL, 0, NULL, EXT4_HT_DIR, EXT4_MAXQUOTAS_TRANS_BLOCKS(dir->i_sb) + 4 + EXT4_XATTR_TRANS_BLOCKS); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); d_tmpfile(file, inode); err = ext4_orphan_add(handle, inode); if (err) goto err_unlock_inode; mark_inode_dirty(inode); unlock_new_inode(inode); } if (handle) ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return finish_open_simple(file, err); err_unlock_inode: ext4_journal_stop(handle); unlock_new_inode(inode); return err; } struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, struct ext4_dir_entry_2 *de, int blocksize, int csum_size, unsigned int parent_ino, int dotdot_real_len) { de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL), blocksize); strcpy(de->name, "."); ext4_set_de_type(inode->i_sb, de, S_IFDIR); de = ext4_next_entry(de, blocksize); de->inode = cpu_to_le32(parent_ino); de->name_len = 2; if (!dotdot_real_len) de->rec_len = ext4_rec_len_to_disk(blocksize - (csum_size + ext4_dir_rec_len(1, NULL)), blocksize); else de->rec_len = ext4_rec_len_to_disk( ext4_dir_rec_len(de->name_len, NULL), blocksize); strcpy(de->name, ".."); ext4_set_de_type(inode->i_sb, de, S_IFDIR); return ext4_next_entry(de, blocksize); } int ext4_init_new_dir(handle_t *handle, 
struct inode *dir, struct inode *inode) { struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; ext4_lblk_t block = 0; unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; int err; if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { err = ext4_try_create_inline_dir(handle, dir, inode); if (err < 0 && err != -ENOSPC) goto out; if (!err) goto out; } inode->i_size = 0; dir_block = ext4_append(handle, inode, &block); if (IS_ERR(dir_block)) return PTR_ERR(dir_block); de = (struct ext4_dir_entry_2 *)dir_block->b_data; ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); set_nlink(inode, 2); if (csum_size) ext4_initialize_dirent_tail(dir_block, blocksize); BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirblock(handle, inode, dir_block); if (err) goto out; set_buffer_verified(dir_block); out: brelse(dir_block); return err; } static int ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; int err, err2 = 0, credits, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; err = dquot_initialize(dir); if (err) return err; credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: inode = ext4_new_inode_start_handle(idmap, dir, S_IFDIR | mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; err = ext4_init_new_dir(handle, dir, inode); if (err) goto out_clear_inode; err = ext4_mark_inode_dirty(handle, inode); if (!err) err = ext4_add_entry(handle, dentry, inode); if (err) { out_clear_inode: clear_nlink(inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2)) err = err2; ext4_journal_stop(handle); iput(inode); goto out_retry; } ext4_inc_count(dir); ext4_update_dx_flag(dir); err = ext4_mark_inode_dirty(handle, dir); if (err) goto out_clear_inode; d_instantiate_new(dentry, inode); ext4_fc_track_create(handle, dentry); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); out_stop: if (handle) ext4_journal_stop(handle); out_retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } /* * routine to check that the specified directory is empty (for rmdir) */ bool ext4_empty_dir(struct inode *inode) { unsigned int offset; struct buffer_head *bh; struct ext4_dir_entry_2 *de; struct super_block *sb; if (ext4_has_inline_data(inode)) { int has_inline_data = 1; int ret; ret = empty_inline_dir(inode, &has_inline_data); if (has_inline_data) return ret; } sb = inode->i_sb; if (inode->i_size < ext4_dir_rec_len(1, NULL) + ext4_dir_rec_len(2, NULL)) { EXT4_ERROR_INODE(inode, "invalid size"); return false; } bh = ext4_read_dirblock(inode, 0, EITHER); if (IS_ERR(bh)) return false; de = (struct ext4_dir_entry_2 *) bh->b_data; if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, 0) || le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) { ext4_warning_inode(inode, "directory missing '.'"); brelse(bh); return false; } offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); de = ext4_next_entry(de, sb->s_blocksize); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || le32_to_cpu(de->inode) == 0 
|| strcmp("..", de->name)) { ext4_warning_inode(inode, "directory missing '..'"); brelse(bh); return false; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); while (offset < inode->i_size) { if (!(offset & (sb->s_blocksize - 1))) { unsigned int lblock; brelse(bh); lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); bh = ext4_read_dirblock(inode, lblock, EITHER); if (bh == NULL) { offset += sb->s_blocksize; continue; } if (IS_ERR(bh)) return false; } de = (struct ext4_dir_entry_2 *) (bh->b_data + (offset & (sb->s_blocksize - 1))); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || le32_to_cpu(de->inode)) { brelse(bh); return false; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } brelse(bh); return true; } static int ext4_rmdir(struct inode *dir, struct dentry *dentry) { int retval; struct inode *inode; struct buffer_head *bh; struct ext4_dir_entry_2 *de; handle_t *handle = NULL; if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; /* Initialize quotas before so that eventual writes go in * separate transaction */ retval = dquot_initialize(dir); if (retval) return retval; retval = dquot_initialize(d_inode(dentry)); if (retval) return retval; retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) goto end_rmdir; inode = d_inode(dentry); retval = -EFSCORRUPTED; if (le32_to_cpu(de->inode) != inode->i_ino) goto end_rmdir; retval = -ENOTEMPTY; if (!ext4_empty_dir(inode)) goto end_rmdir; handle = ext4_journal_start(dir, EXT4_HT_DIR, EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); handle = NULL; goto end_rmdir; } if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_rmdir; if (!EXT4_DIR_LINK_EMPTY(inode)) ext4_warning_inode(inode, "empty directory '%.*s' has too many links (%u)", dentry->d_name.len, dentry->d_name.name, inode->i_nlink); inode_inc_iversion(inode); clear_nlink(inode); /* There's no need to set i_disksize: the fact that i_nlink is * zero will ensure that the right thing happens during any * recovery. */ inode->i_size = 0; ext4_orphan_add(handle, inode); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_set_ctime_current(inode); retval = ext4_mark_inode_dirty(handle, inode); if (retval) goto end_rmdir; ext4_dec_count(dir); ext4_update_dx_flag(dir); ext4_fc_track_unlink(handle, dentry); retval = ext4_mark_inode_dirty(handle, dir); /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the * negative dentries at ext4_lookup(), when it is better * supported by the VFS for the CI case. */ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); end_rmdir: brelse(bh); if (handle) ext4_journal_stop(handle); return retval; } int __ext4_unlink(struct inode *dir, const struct qstr *d_name, struct inode *inode, struct dentry *dentry /* NULL during fast_commit recovery */) { int retval = -ENOENT; struct buffer_head *bh; struct ext4_dir_entry_2 *de; handle_t *handle; int skip_remove_dentry = 0; /* * Keep this outside the transaction; it may have to set up the * directory's encryption key, which isn't GFP_NOFS-safe. 
*/ bh = ext4_find_entry(dir, d_name, &de, NULL); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) return -ENOENT; if (le32_to_cpu(de->inode) != inode->i_ino) { /* * It's okay if we find dont find dentry which matches * the inode. That's because it might have gotten * renamed to a different inode number */ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) skip_remove_dentry = 1; else goto out_bh; } handle = ext4_journal_start(dir, EXT4_HT_DIR, EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); goto out_bh; } if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); if (!skip_remove_dentry) { retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto out_handle; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) goto out_handle; } else { retval = 0; } if (inode->i_nlink == 0) ext4_warning_inode(inode, "Deleting file '%.*s' with no links", d_name->len, d_name->name); else drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); inode_set_ctime_current(inode); retval = ext4_mark_inode_dirty(handle, inode); if (dentry && !retval) ext4_fc_track_unlink(handle, dentry); out_handle: ext4_journal_stop(handle); out_bh: brelse(bh); return retval; } static int ext4_unlink(struct inode *dir, struct dentry *dentry) { int retval; if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; trace_ext4_unlink_enter(dir, dentry); /* * Initialize quotas before so that eventual writes go * in separate transaction */ retval = dquot_initialize(dir); if (retval) goto out_trace; retval = dquot_initialize(d_inode(dentry)); if (retval) goto out_trace; retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry); /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid * invalidating the dentries here, alongside with returning the * negative dentries at ext4_lookup(), when it is better * supported by the VFS for the CI case. */ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); out_trace: trace_ext4_unlink_exit(dentry, retval); return retval; } static int ext4_init_symlink_block(handle_t *handle, struct inode *inode, struct fscrypt_str *disk_link) { struct buffer_head *bh; char *kaddr; int err = 0; bh = ext4_bread(handle, inode, 0, EXT4_GET_BLOCKS_CREATE); if (IS_ERR(bh)) return PTR_ERR(bh); BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto out; kaddr = (char *)bh->b_data; memcpy(kaddr, disk_link->name, disk_link->len); inode->i_size = disk_link->len - 1; EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_handle_dirty_metadata(handle, inode, bh); out: brelse(bh); return err; } static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { handle_t *handle; struct inode *inode; int err, len = strlen(symname); int credits; struct fscrypt_str disk_link; int retries = 0; if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, &disk_link); if (err) return err; err = dquot_initialize(dir); if (err) return err; /* * EXT4_INDEX_EXTRA_TRANS_BLOCKS for addition of entry into the * directory. +3 for inode, inode bitmap, group descriptor allocation. * EXT4_DATA_TRANS_BLOCKS for the data block allocation and * modification. 
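 *
 * Targets whose on-disk representation fits in i_data
 * (disk_link.len <= EXT4_N_BLOCKS * 4, i.e. 60 bytes) are stored as
 * fast symlinks and never touch a data block; the estimate below
 * covers the long-symlink worst case.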
*/ credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; retry: inode = ext4_new_inode_start_handle(idmap, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); if (IS_ERR(inode)) { if (handle) ext4_journal_stop(handle); err = PTR_ERR(inode); goto out_retry; } if (IS_ENCRYPTED(inode)) { err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); if (err) goto err_drop_inode; inode->i_op = &ext4_encrypted_symlink_inode_operations; } else { if ((disk_link.len > EXT4_N_BLOCKS * 4)) { inode->i_op = &ext4_symlink_inode_operations; } else { inode->i_op = &ext4_fast_symlink_inode_operations; } } if ((disk_link.len > EXT4_N_BLOCKS * 4)) { /* alloc symlink block and fill it */ err = ext4_init_symlink_block(handle, inode, &disk_link); if (err) goto err_drop_inode; } else { /* clear the extent format for fast symlink */ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name, disk_link.len); inode->i_size = disk_link.len - 1; EXT4_I(inode)->i_disksize = inode->i_size; if (!IS_ENCRYPTED(inode)) inode_set_cached_link(inode, (char *)&EXT4_I(inode)->i_data, inode->i_size); } err = ext4_add_nondir(handle, dentry, &inode); if (handle) ext4_journal_stop(handle); iput(inode); goto out_retry; err_drop_inode: clear_nlink(inode); ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); if (handle) ext4_journal_stop(handle); iput(inode); out_retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; if (disk_link.name != (unsigned char *)symname) kfree(disk_link.name); return err; } int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry) { handle_t *handle; int err, retries = 0; retry: handle = ext4_journal_start(dir, EXT4_HT_DIR, (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1); if (IS_ERR(handle)) return PTR_ERR(handle); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); inode_set_ctime_current(inode); ext4_inc_count(inode); ihold(inode); err = ext4_add_entry(handle, dentry, inode); if (!err) { err = ext4_mark_inode_dirty(handle, inode); /* this can happen only for tmpfile being * linked the first time */ if (inode->i_nlink == 1) ext4_orphan_del(handle, inode); d_instantiate(dentry, inode); ext4_fc_track_link(handle, dentry); } else { drop_nlink(inode); iput(inode); } ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } static int ext4_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); int err; if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; err = fscrypt_prepare_link(old_dentry, dir, dentry); if (err) return err; if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && (!projid_eq(EXT4_I(dir)->i_projid, EXT4_I(old_dentry->d_inode)->i_projid))) return -EXDEV; err = dquot_initialize(dir); if (err) return err; return __ext4_link(dir, inode, dentry); } /* * Try to find buffer head where contains the parent block. * It should be the inode block if it is inlined or the 1st block * if it is a normal dir. 
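 *
 * This is used by the rename code to locate the ".." entry so it can
 * be repointed at the new parent: for inline_data directories the
 * entry sits in the inode body, otherwise in directory block 0.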
*/ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, struct inode *inode, int *retval, struct ext4_dir_entry_2 **parent_de, int *inlined) { struct buffer_head *bh; if (!ext4_has_inline_data(inode)) { struct ext4_dir_entry_2 *de; unsigned int offset; bh = ext4_read_dirblock(inode, 0, EITHER); if (IS_ERR(bh)) { *retval = PTR_ERR(bh); return NULL; } de = (struct ext4_dir_entry_2 *) bh->b_data; if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, 0) || le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) { EXT4_ERROR_INODE(inode, "directory missing '.'"); brelse(bh); *retval = -EFSCORRUPTED; return NULL; } offset = ext4_rec_len_from_disk(de->rec_len, inode->i_sb->s_blocksize); de = ext4_next_entry(de, inode->i_sb->s_blocksize); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset) || le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { EXT4_ERROR_INODE(inode, "directory missing '..'"); brelse(bh); *retval = -EFSCORRUPTED; return NULL; } *parent_de = de; return bh; } *inlined = 1; return ext4_get_first_inline_block(inode, parent_de, retval); } struct ext4_renament { struct inode *dir; struct dentry *dentry; struct inode *inode; bool is_dir; int dir_nlink_delta; /* entry for "dentry" */ struct buffer_head *bh; struct ext4_dir_entry_2 *de; int inlined; /* entry for ".." in inode if it's a directory */ struct buffer_head *dir_bh; struct ext4_dir_entry_2 *parent_de; int dir_inlined; }; static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent, bool is_cross) { int retval; ent->is_dir = true; if (!is_cross) return 0; ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode, &retval, &ent->parent_de, &ent->dir_inlined); if (!ent->dir_bh) return retval; if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino) return -EFSCORRUPTED; BUFFER_TRACE(ent->dir_bh, "get_write_access"); return ext4_journal_get_write_access(handle, ent->dir->i_sb, ent->dir_bh, EXT4_JTR_NONE); } static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, unsigned dir_ino) { int retval; if (!ent->dir_bh) return 0; ent->parent_de->inode = cpu_to_le32(dir_ino); BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata"); if (!ent->dir_inlined) { if (is_dx(ent->inode)) { retval = ext4_handle_dirty_dx_node(handle, ent->inode, ent->dir_bh); } else { retval = ext4_handle_dirty_dirblock(handle, ent->inode, ent->dir_bh); } } else { retval = ext4_mark_inode_dirty(handle, ent->inode); } if (retval) { ext4_std_error(ent->dir->i_sb, retval); return retval; } return 0; } static int ext4_setent(handle_t *handle, struct ext4_renament *ent, unsigned ino, unsigned file_type) { int retval, retval2; BUFFER_TRACE(ent->bh, "get write access"); retval = ext4_journal_get_write_access(handle, ent->dir->i_sb, ent->bh, EXT4_JTR_NONE); if (retval) return retval; ent->de->inode = cpu_to_le32(ino); if (ext4_has_feature_filetype(ent->dir->i_sb)) ent->de->file_type = file_type; inode_inc_iversion(ent->dir); inode_set_mtime_to_ts(ent->dir, inode_set_ctime_current(ent->dir)); retval = ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { retval2 = ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh); if (unlikely(retval2)) { ext4_std_error(ent->dir->i_sb, retval2); return retval2; } } return retval; } static void ext4_resetent(handle_t *handle, struct ext4_renament *ent, unsigned ino, unsigned file_type) { struct ext4_renament old = *ent; int retval = 0; /* * old->de could have 
moved from under us during make indexed dir, * so the old->de may no longer valid and need to find it again * before reset old inode info. */ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); if (IS_ERR(old.bh)) retval = PTR_ERR(old.bh); if (!old.bh) retval = -ENOENT; if (retval) { ext4_std_error(old.dir->i_sb, retval); return; } ext4_setent(handle, &old, ino, file_type); brelse(old.bh); } static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, const struct qstr *d_name) { int retval = -ENOENT; struct buffer_head *bh; struct ext4_dir_entry_2 *de; bh = ext4_find_entry(dir, d_name, &de, NULL); if (IS_ERR(bh)) return PTR_ERR(bh); if (bh) { retval = ext4_delete_entry(handle, dir, de, bh); brelse(bh); } return retval; } static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent, int force_reread) { int retval; /* * ent->de could have moved from under us during htree split, so make * sure that we are deleting the right entry. We might also be pointing * to a stale entry in the unused part of ent->bh so just checking inum * and the name isn't enough. */ if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || ent->de->name_len != ent->dentry->d_name.len || strncmp(ent->de->name, ent->dentry->d_name.name, ent->de->name_len) || force_reread) { retval = ext4_find_delete_entry(handle, ent->dir, &ent->dentry->d_name); } else { retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh); if (retval == -ENOENT) { retval = ext4_find_delete_entry(handle, ent->dir, &ent->dentry->d_name); } } if (retval) { ext4_warning_inode(ent->dir, "Deleting old file: nlink %d, error=%d", ent->dir->i_nlink, retval); } } static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) { if (ent->dir_nlink_delta) { if (ent->dir_nlink_delta == -1) ext4_dec_count(ent->dir); else ext4_inc_count(ent->dir); ext4_mark_inode_dirty(handle, ent->dir); } } static struct inode *ext4_whiteout_for_rename(struct mnt_idmap *idmap, struct ext4_renament *ent, int credits, handle_t **h) { struct inode *wh; handle_t *handle; int retries = 0; /* * for inode block, sb block, group summaries, * and inode bitmap */ credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) + EXT4_XATTR_TRANS_BLOCKS + 4); retry: wh = ext4_new_inode_start_handle(idmap, ent->dir, S_IFCHR | WHITEOUT_MODE, &ent->dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); if (IS_ERR(wh)) { if (handle) ext4_journal_stop(handle); if (PTR_ERR(wh) == -ENOSPC && ext4_should_retry_alloc(ent->dir->i_sb, &retries)) goto retry; } else { *h = handle; init_special_inode(wh, wh->i_mode, WHITEOUT_DEV); wh->i_op = &ext4_special_inode_operations; } return wh; } /* * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. * * n.b. 
old_{dentry,inode) refers to the source dentry/inode * while new_{dentry,inode) refers to the destination dentry/inode * This comes from rename(const char *oldpath, const char *newpath) */ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { handle_t *handle = NULL; struct ext4_renament old = { .dir = old_dir, .dentry = old_dentry, .inode = d_inode(old_dentry), }; struct ext4_renament new = { .dir = new_dir, .dentry = new_dentry, .inode = d_inode(new_dentry), }; int force_reread; int retval; struct inode *whiteout = NULL; int credits; u8 old_file_type; if (new.inode && new.inode->i_nlink == 0) { EXT4_ERROR_INODE(new.inode, "target of rename is already freed"); return -EFSCORRUPTED; } if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) && (!projid_eq(EXT4_I(new_dir)->i_projid, EXT4_I(old_dentry->d_inode)->i_projid))) return -EXDEV; retval = dquot_initialize(old.dir); if (retval) return retval; retval = dquot_initialize(old.inode); if (retval) return retval; retval = dquot_initialize(new.dir); if (retval) return retval; /* Initialize quotas before so that eventual writes go * in separate transaction */ if (new.inode) { retval = dquot_initialize(new.inode); if (retval) return retval; } old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); if (IS_ERR(old.bh)) return PTR_ERR(old.bh); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process * and merrily kill the link to whatever was created under the * same name. Goodbye sticky bit ;-< */ retval = -ENOENT; if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) goto release_bh; new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); if (IS_ERR(new.bh)) { retval = PTR_ERR(new.bh); new.bh = NULL; goto release_bh; } if (new.bh) { if (!new.inode) { brelse(new.bh); new.bh = NULL; } } if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) ext4_alloc_da_blocks(old.inode); credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); if (!(flags & RENAME_WHITEOUT)) { handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits); if (IS_ERR(handle)) { retval = PTR_ERR(handle); goto release_bh; } } else { whiteout = ext4_whiteout_for_rename(idmap, &old, credits, &handle); if (IS_ERR(whiteout)) { retval = PTR_ERR(whiteout); goto release_bh; } } old_file_type = old.de->file_type; if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) ext4_handle_sync(handle); if (S_ISDIR(old.inode->i_mode)) { if (new.inode) { retval = -ENOTEMPTY; if (!ext4_empty_dir(new.inode)) goto end_rename; } else { retval = -EMLINK; if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) goto end_rename; } retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir); if (retval) goto end_rename; } /* * If we're renaming a file within an inline_data dir and adding or * setting the new dirent causes a conversion from inline_data to * extents/blockmap, we need to force the dirent delete code to * re-read the directory, or else we end up trying to delete a dirent * from what is now the extent tree root (or a block map). */ force_reread = (new.dir->i_ino == old.dir->i_ino && ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); if (whiteout) { /* * Do this before adding a new entry, so the old entry is sure * to be still pointing to the valid old entry. 
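 *
 * ext4_add_entry() further down may split leaf blocks or convert the
 * directory, either of which can move old.de; rewriting the old
 * entry to point at the whiteout now, while old.bh/old.de are still
 * valid, avoids that.  ext4_resetent() undoes this on the error
 * path.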
*/ retval = ext4_setent(handle, &old, whiteout->i_ino, EXT4_FT_CHRDEV); if (retval) goto end_rename; retval = ext4_mark_inode_dirty(handle, whiteout); if (unlikely(retval)) goto end_rename; } if (!new.bh) { retval = ext4_add_entry(handle, new.dentry, old.inode); if (retval) goto end_rename; } else { retval = ext4_setent(handle, &new, old.inode->i_ino, old_file_type); if (retval) goto end_rename; } if (force_reread) force_reread = !ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA); /* * Like most other Unix systems, set the ctime for inodes on a * rename. */ inode_set_ctime_current(old.inode); retval = ext4_mark_inode_dirty(handle, old.inode); if (unlikely(retval)) goto end_rename; if (!whiteout) { /* * ok, that's it */ ext4_rename_delete(handle, &old, force_reread); } if (new.inode) { ext4_dec_count(new.inode); inode_set_ctime_current(new.inode); } inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir)); ext4_update_dx_flag(old.dir); if (old.is_dir) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) goto end_rename; ext4_dec_count(old.dir); if (new.inode) { /* checked ext4_empty_dir above, can't have another * parent, ext4_dec_count() won't work for many-linked * dirs */ clear_nlink(new.inode); } else { ext4_inc_count(new.dir); ext4_update_dx_flag(new.dir); retval = ext4_mark_inode_dirty(handle, new.dir); if (unlikely(retval)) goto end_rename; } } retval = ext4_mark_inode_dirty(handle, old.dir); if (unlikely(retval)) goto end_rename; if (old.is_dir) { /* * We disable fast commits here that's because the * replay code is not yet capable of changing dot dot * dirents in directories. */ ext4_fc_mark_ineligible(old.inode->i_sb, EXT4_FC_REASON_RENAME_DIR, handle); } else { struct super_block *sb = old.inode->i_sb; if (new.inode) ext4_fc_track_unlink(handle, new.dentry); if (test_opt2(sb, JOURNAL_FAST_COMMIT) && !(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && !(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))) { __ext4_fc_track_link(handle, old.inode, new.dentry); __ext4_fc_track_unlink(handle, old.inode, old.dentry); if (whiteout) __ext4_fc_track_create(handle, whiteout, old.dentry); } } if (new.inode) { retval = ext4_mark_inode_dirty(handle, new.inode); if (unlikely(retval)) goto end_rename; if (!new.inode->i_nlink) ext4_orphan_add(handle, new.inode); } retval = 0; end_rename: if (whiteout) { if (retval) { ext4_resetent(handle, &old, old.inode->i_ino, old_file_type); drop_nlink(whiteout); ext4_mark_inode_dirty(handle, whiteout); ext4_orphan_add(handle, whiteout); } unlock_new_inode(whiteout); ext4_journal_stop(handle); iput(whiteout); } else { ext4_journal_stop(handle); } release_bh: brelse(old.dir_bh); brelse(old.bh); brelse(new.bh); return retval; } static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { handle_t *handle = NULL; struct ext4_renament old = { .dir = old_dir, .dentry = old_dentry, .inode = d_inode(old_dentry), }; struct ext4_renament new = { .dir = new_dir, .dentry = new_dentry, .inode = d_inode(new_dentry), }; u8 new_file_type; int retval; if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && !projid_eq(EXT4_I(new_dir)->i_projid, EXT4_I(old_dentry->d_inode)->i_projid)) || (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) && !projid_eq(EXT4_I(old_dir)->i_projid, EXT4_I(new_dentry->d_inode)->i_projid))) return -EXDEV; retval = dquot_initialize(old.dir); if (retval) return retval; retval = dquot_initialize(new.dir); if (retval) return retval; old.bh = 
ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); if (IS_ERR(old.bh)) return PTR_ERR(old.bh); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process * and merrily kill the link to whatever was created under the * same name. Goodbye sticky bit ;-< */ retval = -ENOENT; if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) goto end_rename; new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); if (IS_ERR(new.bh)) { retval = PTR_ERR(new.bh); new.bh = NULL; goto end_rename; } /* RENAME_EXCHANGE case: old *and* new must both exist */ if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) goto end_rename; handle = ext4_journal_start(old.dir, EXT4_HT_DIR, (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); handle = NULL; goto end_rename; } if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) ext4_handle_sync(handle); if (S_ISDIR(old.inode->i_mode)) { retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir); if (retval) goto end_rename; } if (S_ISDIR(new.inode->i_mode)) { retval = ext4_rename_dir_prepare(handle, &new, new.dir != old.dir); if (retval) goto end_rename; } /* * Other than the special case of overwriting a directory, parents' * nlink only needs to be modified if this is a cross directory rename. */ if (old.dir != new.dir && old.is_dir != new.is_dir) { old.dir_nlink_delta = old.is_dir ? -1 : 1; new.dir_nlink_delta = -old.dir_nlink_delta; retval = -EMLINK; if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) || (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir))) goto end_rename; } new_file_type = new.de->file_type; retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type); if (retval) goto end_rename; retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type); if (retval) goto end_rename; /* * Like most other Unix systems, set the ctime for inodes on a * rename. */ inode_set_ctime_current(old.inode); inode_set_ctime_current(new.inode); retval = ext4_mark_inode_dirty(handle, old.inode); if (unlikely(retval)) goto end_rename; retval = ext4_mark_inode_dirty(handle, new.inode); if (unlikely(retval)) goto end_rename; ext4_fc_mark_ineligible(new.inode->i_sb, EXT4_FC_REASON_CROSS_RENAME, handle); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) goto end_rename; } if (new.dir_bh) { retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino); if (retval) goto end_rename; } ext4_update_dir_count(handle, &old); ext4_update_dir_count(handle, &new); retval = 0; end_rename: brelse(old.dir_bh); brelse(new.dir_bh); brelse(old.bh); brelse(new.bh); if (handle) ext4_journal_stop(handle); return retval; } static int ext4_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { int err; if (unlikely(ext4_forced_shutdown(old_dir->i_sb))) return -EIO; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, flags); if (err) return err; if (flags & RENAME_EXCHANGE) { return ext4_cross_rename(old_dir, old_dentry, new_dir, new_dentry); } return ext4_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); } /* * directories can handle most operations... 
*/ const struct inode_operations ext4_dir_inode_operations = { .create = ext4_create, .lookup = ext4_lookup, .link = ext4_link, .unlink = ext4_unlink, .symlink = ext4_symlink, .mkdir = ext4_mkdir, .rmdir = ext4_rmdir, .mknod = ext4_mknod, .tmpfile = ext4_tmpfile, .rename = ext4_rename2, .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, .fileattr_set = ext4_fileattr_set, }; const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, };
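/*
 * Illustrative userspace sketch (not part of the kernel source above):
 * ext4_rename2() is reached through the renameat2(2) system call, which
 * carries the RENAME_NOREPLACE / RENAME_EXCHANGE / RENAME_WHITEOUT flags
 * it dispatches on. A minimal caller, assuming headers new enough to
 * provide SYS_renameat2 and the RENAME_* flags; the file names are
 * placeholders.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/fs.h>

int main(void)
{
	/* Atomically swap two paths; both must already exist (RENAME_EXCHANGE). */
	if (syscall(SYS_renameat2, AT_FDCWD, "a.txt",
		    AT_FDCWD, "b.txt", RENAME_EXCHANGE) < 0)
		perror("renameat2(RENAME_EXCHANGE)");

	/* Fail with EEXIST instead of replacing an existing destination. */
	if (syscall(SYS_renameat2, AT_FDCWD, "a.txt",
		    AT_FDCWD, "c.txt", RENAME_NOREPLACE) < 0)
		perror("renameat2(RENAME_NOREPLACE)");
	return 0;
}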
// SPDX-License-Identifier: GPL-2.0 #include <linux/compat.h> #include <linux/console.h> #include <linux/fb.h> #include <linux/fbcon.h> #include <linux/major.h> #include "fb_internal.h" /* * We hold a reference to the fb_info in file->private_data, * but if the current registered fb has changed, we don't * actually want to use it. * * So look up the fb_info using the inode minor number, * and just verify it against the reference we have. */ static struct fb_info *file_fb_info(struct file *file) { struct inode *inode = file_inode(file); int fbidx = iminor(inode); struct fb_info *info = registered_fb[fbidx]; if (info != file->private_data) info = NULL; return info; } static ssize_t fb_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct fb_info *info = file_fb_info(file); if (!info) return -ENODEV; if (fb_WARN_ON_ONCE(info, !info->fbops->fb_read)) return -EINVAL; if (info->state != FBINFO_STATE_RUNNING) return -EPERM; return info->fbops->fb_read(info, buf, count, ppos); } static ssize_t fb_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct fb_info *info = file_fb_info(file); if (!info) return -ENODEV; if (fb_WARN_ON_ONCE(info, !info->fbops->fb_write)) return -EINVAL; if (info->state != FBINFO_STATE_RUNNING) return -EPERM; return info->fbops->fb_write(info, buf, count, ppos); } static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, unsigned long arg) { const struct fb_ops *fb; struct fb_var_screeninfo var; struct fb_fix_screeninfo fix; struct fb_cmap cmap_from; struct fb_cmap_user cmap; void __user *argp = (void __user *)arg; long ret = 0; switch (cmd) { case FBIOGET_VSCREENINFO: lock_fb_info(info); var = info->var; unlock_fb_info(info); ret = copy_to_user(argp, &var, sizeof(var)) ?
-EFAULT : 0; break; case FBIOPUT_VSCREENINFO: if (copy_from_user(&var, argp, sizeof(var))) return -EFAULT; /* only for kernel-internal use */ var.activate &= ~FB_ACTIVATE_KD_TEXT; console_lock(); lock_fb_info(info); ret = fbcon_modechange_possible(info, &var); if (!ret) ret = fb_set_var(info, &var); if (!ret) fbcon_update_vcs(info, var.activate & FB_ACTIVATE_ALL); unlock_fb_info(info); console_unlock(); if (!ret && copy_to_user(argp, &var, sizeof(var))) ret = -EFAULT; break; case FBIOGET_FSCREENINFO: lock_fb_info(info); memcpy(&fix, &info->fix, sizeof(fix)); if (info->flags & FBINFO_HIDE_SMEM_START) fix.smem_start = 0; unlock_fb_info(info); ret = copy_to_user(argp, &fix, sizeof(fix)) ? -EFAULT : 0; break; case FBIOPUTCMAP: if (copy_from_user(&cmap, argp, sizeof(cmap))) return -EFAULT; ret = fb_set_user_cmap(&cmap, info); break; case FBIOGETCMAP: if (copy_from_user(&cmap, argp, sizeof(cmap))) return -EFAULT; lock_fb_info(info); cmap_from = info->cmap; unlock_fb_info(info); ret = fb_cmap_to_user(&cmap_from, &cmap); break; case FBIOPAN_DISPLAY: if (copy_from_user(&var, argp, sizeof(var))) return -EFAULT; console_lock(); lock_fb_info(info); ret = fb_pan_display(info, &var); unlock_fb_info(info); console_unlock(); if (ret == 0 && copy_to_user(argp, &var, sizeof(var))) return -EFAULT; break; case FBIO_CURSOR: ret = -EINVAL; break; case FBIOGET_CON2FBMAP: ret = fbcon_get_con2fb_map_ioctl(argp); break; case FBIOPUT_CON2FBMAP: ret = fbcon_set_con2fb_map_ioctl(argp); break; case FBIOBLANK: if (arg > FB_BLANK_POWERDOWN) return -EINVAL; console_lock(); lock_fb_info(info); ret = fb_blank(info, arg); /* might again call into fb_blank */ fbcon_fb_blanked(info, arg); unlock_fb_info(info); console_unlock(); break; default: lock_fb_info(info); fb = info->fbops; if (fb->fb_ioctl) ret = fb->fb_ioctl(info, cmd, arg); else ret = -ENOTTY; unlock_fb_info(info); } return ret; } static long fb_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fb_info *info = file_fb_info(file); if (!info) return -ENODEV; return do_fb_ioctl(info, cmd, arg); } #ifdef CONFIG_COMPAT struct fb_fix_screeninfo32 { char id[16]; compat_caddr_t smem_start; u32 smem_len; u32 type; u32 type_aux; u32 visual; u16 xpanstep; u16 ypanstep; u16 ywrapstep; u32 line_length; compat_caddr_t mmio_start; u32 mmio_len; u32 accel; u16 reserved[3]; }; struct fb_cmap32 { u32 start; u32 len; compat_caddr_t red; compat_caddr_t green; compat_caddr_t blue; compat_caddr_t transp; }; static int fb_getput_cmap(struct fb_info *info, unsigned int cmd, unsigned long arg) { struct fb_cmap32 cmap32; struct fb_cmap cmap_from; struct fb_cmap_user cmap; if (copy_from_user(&cmap32, compat_ptr(arg), sizeof(cmap32))) return -EFAULT; cmap = (struct fb_cmap_user) { .start = cmap32.start, .len = cmap32.len, .red = compat_ptr(cmap32.red), .green = compat_ptr(cmap32.green), .blue = compat_ptr(cmap32.blue), .transp = compat_ptr(cmap32.transp), }; if (cmd == FBIOPUTCMAP) return fb_set_user_cmap(&cmap, info); lock_fb_info(info); cmap_from = info->cmap; unlock_fb_info(info); return fb_cmap_to_user(&cmap_from, &cmap); } static int do_fscreeninfo_to_user(struct fb_fix_screeninfo *fix, struct fb_fix_screeninfo32 __user *fix32) { __u32 data; int err; err = copy_to_user(&fix32->id, &fix->id, sizeof(fix32->id)); data = (__u32) (unsigned long) fix->smem_start; err |= put_user(data, &fix32->smem_start); err |= put_user(fix->smem_len, &fix32->smem_len); err |= put_user(fix->type, &fix32->type); err |= put_user(fix->type_aux, &fix32->type_aux); err |= put_user(fix->visual, 
&fix32->visual); err |= put_user(fix->xpanstep, &fix32->xpanstep); err |= put_user(fix->ypanstep, &fix32->ypanstep); err |= put_user(fix->ywrapstep, &fix32->ywrapstep); err |= put_user(fix->line_length, &fix32->line_length); data = (__u32) (unsigned long) fix->mmio_start; err |= put_user(data, &fix32->mmio_start); err |= put_user(fix->mmio_len, &fix32->mmio_len); err |= put_user(fix->accel, &fix32->accel); err |= copy_to_user(fix32->reserved, fix->reserved, sizeof(fix->reserved)); if (err) return -EFAULT; return 0; } static int fb_get_fscreeninfo(struct fb_info *info, unsigned int cmd, unsigned long arg) { struct fb_fix_screeninfo fix; lock_fb_info(info); fix = info->fix; if (info->flags & FBINFO_HIDE_SMEM_START) fix.smem_start = 0; unlock_fb_info(info); return do_fscreeninfo_to_user(&fix, compat_ptr(arg)); } static long fb_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fb_info *info = file_fb_info(file); const struct fb_ops *fb; long ret = -ENOIOCTLCMD; if (!info) return -ENODEV; fb = info->fbops; switch (cmd) { case FBIOGET_VSCREENINFO: case FBIOPUT_VSCREENINFO: case FBIOPAN_DISPLAY: case FBIOGET_CON2FBMAP: case FBIOPUT_CON2FBMAP: arg = (unsigned long) compat_ptr(arg); fallthrough; case FBIOBLANK: ret = do_fb_ioctl(info, cmd, arg); break; case FBIOGET_FSCREENINFO: ret = fb_get_fscreeninfo(info, cmd, arg); break; case FBIOGETCMAP: case FBIOPUTCMAP: ret = fb_getput_cmap(info, cmd, arg); break; default: if (fb->fb_compat_ioctl) ret = fb->fb_compat_ioctl(info, cmd, arg); break; } return ret; } #endif static int fb_mmap(struct file *file, struct vm_area_struct *vma) { struct fb_info *info = file_fb_info(file); int res; if (!info) return -ENODEV; if (fb_WARN_ON_ONCE(info, !info->fbops->fb_mmap)) return -ENODEV; mutex_lock(&info->mm_lock); res = info->fbops->fb_mmap(info, vma); mutex_unlock(&info->mm_lock); return res; } static int fb_open(struct inode *inode, struct file *file) __acquires(&info->lock) __releases(&info->lock) { int fbidx = iminor(inode); struct fb_info *info; int res = 0; info = get_fb_info(fbidx); if (!info) { request_module("fb%d", fbidx); info = get_fb_info(fbidx); if (!info) return -ENODEV; } if (IS_ERR(info)) return PTR_ERR(info); lock_fb_info(info); if (!try_module_get(info->fbops->owner)) { res = -ENODEV; goto out; } file->private_data = info; if (info->fbops->fb_open) { res = info->fbops->fb_open(info, 1); if (res) module_put(info->fbops->owner); } #ifdef CONFIG_FB_DEFERRED_IO if (info->fbdefio) fb_deferred_io_open(info, inode, file); #endif out: unlock_fb_info(info); if (res) put_fb_info(info); return res; } static int fb_release(struct inode *inode, struct file *file) __acquires(&info->lock) __releases(&info->lock) { struct fb_info * const info = file->private_data; lock_fb_info(info); #if IS_ENABLED(CONFIG_FB_DEFERRED_IO) if (info->fbdefio) fb_deferred_io_release(info); #endif if (info->fbops->fb_release) info->fbops->fb_release(info, 1); module_put(info->fbops->owner); unlock_fb_info(info); put_fb_info(info); return 0; } #if defined(CONFIG_FB_PROVIDE_GET_FB_UNMAPPED_AREA) && !defined(CONFIG_MMU) static unsigned long get_fb_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct fb_info * const info = filp->private_data; unsigned long fb_size = PAGE_ALIGN(info->fix.smem_len); if (pgoff > fb_size || len > fb_size - pgoff) return -EINVAL; return (unsigned long)info->screen_base + pgoff; } #endif static const struct file_operations fb_fops = { .owner = THIS_MODULE, .read = 
fb_read, .write = fb_write, .unlocked_ioctl = fb_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = fb_compat_ioctl, #endif .mmap = fb_mmap, .open = fb_open, .release = fb_release, #if defined(HAVE_ARCH_FB_UNMAPPED_AREA) || \ (defined(CONFIG_FB_PROVIDE_GET_FB_UNMAPPED_AREA) && \ !defined(CONFIG_MMU)) .get_unmapped_area = get_fb_unmapped_area, #endif #ifdef CONFIG_FB_DEFERRED_IO .fsync = fb_deferred_io_fsync, #endif .llseek = default_llseek, }; int fb_register_chrdev(void) { int ret; ret = register_chrdev(FB_MAJOR, "fb", &fb_fops); if (ret) { pr_err("Unable to get major %d for fb devs\n", FB_MAJOR); return ret; } return ret; } void fb_unregister_chrdev(void) { unregister_chrdev(FB_MAJOR, "fb"); }
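/*
 * Illustrative userspace sketch (not part of the file above): the fb_fops
 * registered by fb_register_chrdev() back the /dev/fbN character devices,
 * so the ioctl and mmap paths shown above can be exercised like this.
 * Assumes a framebuffer device exists at /dev/fb0.
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/fb.h>

int main(void)
{
	struct fb_var_screeninfo var;
	struct fb_fix_screeninfo fix;
	int fd = open("/dev/fb0", O_RDWR);

	if (fd < 0)
		return 1;
	/* Both requests are handled by do_fb_ioctl() in the driver above. */
	if (ioctl(fd, FBIOGET_VSCREENINFO, &var) == 0 &&
	    ioctl(fd, FBIOGET_FSCREENINFO, &fix) == 0) {
		printf("%ux%u, %u bpp, line length %u\n",
		       var.xres, var.yres, var.bits_per_pixel, fix.line_length);
		/* fb_mmap() maps the video memory into this process. */
		void *fb = mmap(NULL, fix.smem_len, PROT_READ | PROT_WRITE,
				MAP_SHARED, fd, 0);
		if (fb != MAP_FAILED) {
			memset(fb, 0, fix.line_length); /* clear the first scanline */
			munmap(fb, fix.smem_len);
		}
	}
	close(fd);
	return 0;
}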
// SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0-only) /* Copyright(c) 2014 - 2020 Intel Corporation */ #include <crypto/algapi.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/bitops.h> #include <linux/pci.h> #include <linux/cdev.h> #include <linux/uaccess.h> #include "adf_accel_devices.h" #include "adf_common_drv.h" #include "adf_cfg.h" #include "adf_cfg_common.h" #include "adf_cfg_user.h" #define ADF_CFG_MAX_SECTION 512 #define ADF_CFG_MAX_KEY_VAL 256 #define DEVICE_NAME "qat_adf_ctl" static DEFINE_MUTEX(adf_ctl_lock); static long adf_ctl_ioctl(struct file *fp, unsigned int cmd, unsigned long arg); static const struct file_operations adf_ctl_ops = { .owner = THIS_MODULE, .unlocked_ioctl = adf_ctl_ioctl, .compat_ioctl = compat_ptr_ioctl, }; static const struct class adf_ctl_class = { .name = DEVICE_NAME, }; struct adf_ctl_drv_info { unsigned int major; struct cdev drv_cdev; }; static struct adf_ctl_drv_info adf_ctl_drv; static void adf_chr_drv_destroy(void) { device_destroy(&adf_ctl_class, MKDEV(adf_ctl_drv.major, 0)); cdev_del(&adf_ctl_drv.drv_cdev); class_unregister(&adf_ctl_class); unregister_chrdev_region(MKDEV(adf_ctl_drv.major, 0), 1); } static int adf_chr_drv_create(void) { dev_t dev_id; struct device *drv_device; int ret; if (alloc_chrdev_region(&dev_id, 0, 1, DEVICE_NAME)) { pr_err("QAT: unable to allocate chrdev region\n"); return -EFAULT; } ret = class_register(&adf_ctl_class); if (ret) goto err_chrdev_unreg; adf_ctl_drv.major = MAJOR(dev_id); cdev_init(&adf_ctl_drv.drv_cdev, &adf_ctl_ops); if (cdev_add(&adf_ctl_drv.drv_cdev, dev_id, 1)) { pr_err("QAT: cdev add failed\n"); goto
err_class_destr; } drv_device = device_create(&adf_ctl_class, NULL, MKDEV(adf_ctl_drv.major, 0), NULL, DEVICE_NAME); if (IS_ERR(drv_device)) { pr_err("QAT: failed to create device\n"); goto err_cdev_del; } return 0; err_cdev_del: cdev_del(&adf_ctl_drv.drv_cdev); err_class_destr: class_unregister(&adf_ctl_class); err_chrdev_unreg: unregister_chrdev_region(dev_id, 1); return -EFAULT; } static int adf_ctl_alloc_resources(struct adf_user_cfg_ctl_data **ctl_data, unsigned long arg) { struct adf_user_cfg_ctl_data *cfg_data; cfg_data = kzalloc(sizeof(*cfg_data), GFP_KERNEL); if (!cfg_data) return -ENOMEM; /* Initialize device id to NO DEVICE as 0 is a valid device id */ cfg_data->device_id = ADF_CFG_NO_DEVICE; if (copy_from_user(cfg_data, (void __user *)arg, sizeof(*cfg_data))) { pr_err("QAT: failed to copy from user cfg_data.\n"); kfree(cfg_data); return -EIO; } *ctl_data = cfg_data; return 0; } static int adf_add_key_value_data(struct adf_accel_dev *accel_dev, const char *section, const struct adf_user_cfg_key_val *key_val) { if (key_val->type == ADF_HEX) { long *ptr = (long *)key_val->val; long val = *ptr; if (adf_cfg_add_key_value_param(accel_dev, section, key_val->key, (void *)val, key_val->type)) { dev_err(&GET_DEV(accel_dev), "failed to add hex keyvalue.\n"); return -EFAULT; } } else { if (adf_cfg_add_key_value_param(accel_dev, section, key_val->key, key_val->val, key_val->type)) { dev_err(&GET_DEV(accel_dev), "failed to add keyvalue.\n"); return -EFAULT; } } return 0; } static int adf_copy_key_value_data(struct adf_accel_dev *accel_dev, struct adf_user_cfg_ctl_data *ctl_data) { struct adf_user_cfg_key_val key_val; struct adf_user_cfg_key_val *params_head; struct adf_user_cfg_section section, *section_head; int i, j; section_head = ctl_data->config_section; for (i = 0; section_head && i < ADF_CFG_MAX_SECTION; i++) { if (copy_from_user(&section, (void __user *)section_head, sizeof(*section_head))) { dev_err(&GET_DEV(accel_dev), "failed to copy section info\n"); goto out_err; } if (adf_cfg_section_add(accel_dev, section.name)) { dev_err(&GET_DEV(accel_dev), "failed to add section.\n"); goto out_err; } params_head = section.params; for (j = 0; params_head && j < ADF_CFG_MAX_KEY_VAL; j++) { if (copy_from_user(&key_val, (void __user *)params_head, sizeof(key_val))) { dev_err(&GET_DEV(accel_dev), "Failed to copy keyvalue.\n"); goto out_err; } if (adf_add_key_value_data(accel_dev, section.name, &key_val)) { goto out_err; } params_head = key_val.next; } section_head = section.next; } return 0; out_err: adf_cfg_del_all(accel_dev); return -EFAULT; } static int adf_ctl_ioctl_dev_config(struct file *fp, unsigned int cmd, unsigned long arg) { int ret; struct adf_user_cfg_ctl_data *ctl_data; struct adf_accel_dev *accel_dev; ret = adf_ctl_alloc_resources(&ctl_data, arg); if (ret) return ret; accel_dev = adf_devmgr_get_dev_by_id(ctl_data->device_id); if (!accel_dev) { ret = -EFAULT; goto out; } if (adf_dev_started(accel_dev)) { ret = -EFAULT; goto out; } if (adf_copy_key_value_data(accel_dev, ctl_data)) { ret = -EFAULT; goto out; } set_bit(ADF_STATUS_CONFIGURED, &accel_dev->status); out: kfree(ctl_data); return ret; } static int adf_ctl_is_device_in_use(int id) { struct adf_accel_dev *dev; list_for_each_entry(dev, adf_devmgr_get_head(), list) { if (id == dev->accel_id || id == ADF_CFG_ALL_DEVICES) { if (adf_devmgr_in_reset(dev) || adf_dev_in_use(dev)) { dev_info(&GET_DEV(dev), "device qat_dev%d is busy\n", dev->accel_id); return -EBUSY; } } } return 0; } static void adf_ctl_stop_devices(u32 id) { struct
adf_accel_dev *accel_dev; list_for_each_entry(accel_dev, adf_devmgr_get_head(), list) { if (id == accel_dev->accel_id || id == ADF_CFG_ALL_DEVICES) { if (!adf_dev_started(accel_dev)) continue; /* First stop all VFs */ if (!accel_dev->is_vf) continue; adf_dev_down(accel_dev); } } list_for_each_entry(accel_dev, adf_devmgr_get_head(), list) { if (id == accel_dev->accel_id || id == ADF_CFG_ALL_DEVICES) { if (!adf_dev_started(accel_dev)) continue; adf_dev_down(accel_dev); } } } static int adf_ctl_ioctl_dev_stop(struct file *fp, unsigned int cmd, unsigned long arg) { int ret; struct adf_user_cfg_ctl_data *ctl_data; ret = adf_ctl_alloc_resources(&ctl_data, arg); if (ret) return ret; if (adf_devmgr_verify_id(ctl_data->device_id)) { pr_err("QAT: Device %d not found\n", ctl_data->device_id); ret = -ENODEV; goto out; } ret = adf_ctl_is_device_in_use(ctl_data->device_id); if (ret) goto out; if (ctl_data->device_id == ADF_CFG_ALL_DEVICES) pr_info("QAT: Stopping all acceleration devices.\n"); else pr_info("QAT: Stopping acceleration device qat_dev%d.\n", ctl_data->device_id); adf_ctl_stop_devices(ctl_data->device_id); out: kfree(ctl_data); return ret; } static int adf_ctl_ioctl_dev_start(struct file *fp, unsigned int cmd, unsigned long arg) { int ret; struct adf_user_cfg_ctl_data *ctl_data; struct adf_accel_dev *accel_dev; ret = adf_ctl_alloc_resources(&ctl_data, arg); if (ret) return ret; ret = -ENODEV; accel_dev = adf_devmgr_get_dev_by_id(ctl_data->device_id); if (!accel_dev) goto out; dev_info(&GET_DEV(accel_dev), "Starting acceleration device qat_dev%d.\n", ctl_data->device_id); ret = adf_dev_up(accel_dev, false); if (ret) { dev_err(&GET_DEV(accel_dev), "Failed to start qat_dev%d\n", ctl_data->device_id); adf_dev_down(accel_dev); } out: kfree(ctl_data); return ret; } static int adf_ctl_ioctl_get_num_devices(struct file *fp, unsigned int cmd, unsigned long arg) { u32 num_devices = 0; adf_devmgr_get_num_dev(&num_devices); if (copy_to_user((void __user *)arg, &num_devices, sizeof(num_devices))) return -EFAULT; return 0; } static int adf_ctl_ioctl_get_status(struct file *fp, unsigned int cmd, unsigned long arg) { struct adf_hw_device_data *hw_data; struct adf_dev_status_info dev_info; struct adf_accel_dev *accel_dev; if (copy_from_user(&dev_info, (void __user *)arg, sizeof(struct adf_dev_status_info))) { pr_err("QAT: failed to copy from user.\n"); return -EFAULT; } accel_dev = adf_devmgr_get_dev_by_id(dev_info.accel_id); if (!accel_dev) return -ENODEV; hw_data = accel_dev->hw_device; dev_info.state = adf_dev_started(accel_dev) ? 
DEV_UP : DEV_DOWN; dev_info.num_ae = hw_data->get_num_aes(hw_data); dev_info.num_accel = hw_data->get_num_accels(hw_data); dev_info.num_logical_accel = hw_data->num_logical_accel; dev_info.banks_per_accel = hw_data->num_banks / hw_data->num_logical_accel; strscpy(dev_info.name, hw_data->dev_class->name, sizeof(dev_info.name)); dev_info.instance_id = hw_data->instance_id; dev_info.type = hw_data->dev_class->type; dev_info.bus = accel_to_pci_dev(accel_dev)->bus->number; dev_info.dev = PCI_SLOT(accel_to_pci_dev(accel_dev)->devfn); dev_info.fun = PCI_FUNC(accel_to_pci_dev(accel_dev)->devfn); if (copy_to_user((void __user *)arg, &dev_info, sizeof(struct adf_dev_status_info))) { dev_err(&GET_DEV(accel_dev), "failed to copy status.\n"); return -EFAULT; } return 0; } static long adf_ctl_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { int ret; if (mutex_lock_interruptible(&adf_ctl_lock)) return -EFAULT; switch (cmd) { case IOCTL_CONFIG_SYS_RESOURCE_PARAMETERS: ret = adf_ctl_ioctl_dev_config(fp, cmd, arg); break; case IOCTL_STOP_ACCEL_DEV: ret = adf_ctl_ioctl_dev_stop(fp, cmd, arg); break; case IOCTL_START_ACCEL_DEV: ret = adf_ctl_ioctl_dev_start(fp, cmd, arg); break; case IOCTL_GET_NUM_DEVICES: ret = adf_ctl_ioctl_get_num_devices(fp, cmd, arg); break; case IOCTL_STATUS_ACCEL_DEV: ret = adf_ctl_ioctl_get_status(fp, cmd, arg); break; default: pr_err_ratelimited("QAT: Invalid ioctl %d\n", cmd); ret = -EFAULT; break; } mutex_unlock(&adf_ctl_lock); return ret; } static int __init adf_register_ctl_device_driver(void) { if (adf_chr_drv_create()) goto err_chr_dev; if (adf_init_misc_wq()) goto err_misc_wq; if (adf_init_aer()) goto err_aer; if (adf_init_pf_wq()) goto err_pf_wq; if (adf_init_vf_wq()) goto err_vf_wq; if (qat_crypto_register()) goto err_crypto_register; if (qat_compression_register()) goto err_compression_register; return 0; err_compression_register: qat_crypto_unregister(); err_crypto_register: adf_exit_vf_wq(); err_vf_wq: adf_exit_pf_wq(); err_pf_wq: adf_exit_aer(); err_aer: adf_exit_misc_wq(); err_misc_wq: adf_chr_drv_destroy(); err_chr_dev: mutex_destroy(&adf_ctl_lock); return -EFAULT; } static void __exit adf_unregister_ctl_device_driver(void) { adf_chr_drv_destroy(); adf_exit_misc_wq(); adf_exit_aer(); adf_exit_vf_wq(); adf_exit_pf_wq(); qat_crypto_unregister(); qat_compression_unregister(); adf_clean_vf_map(false); mutex_destroy(&adf_ctl_lock); } module_init(adf_register_ctl_device_driver); module_exit(adf_unregister_ctl_device_driver); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Intel"); MODULE_DESCRIPTION("Intel(R) QuickAssist Technology"); MODULE_ALIAS_CRYPTO("intel_qat"); MODULE_VERSION(ADF_DRV_VERSION); MODULE_IMPORT_NS("CRYPTO_INTERNAL"); |
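/*
 * Illustrative userspace sketch (not part of the driver above): the commands
 * dispatched by adf_ctl_ioctl() are reached through the character device that
 * device_create() exposes, typically /dev/qat_adf_ctl. The command macros
 * (e.g. IOCTL_GET_NUM_DEVICES) live in the driver's adf_cfg_common.h /
 * adf_cfg_user.h, which this sketch assumes have been copied alongside it.
 */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "adf_cfg_common.h"	/* assumed: copied from the driver sources */

int main(void)
{
	uint32_t num_devices = 0;
	int fd = open("/dev/qat_adf_ctl", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/qat_adf_ctl");
		return 1;
	}
	/* Served by adf_ctl_ioctl_get_num_devices() above. */
	if (ioctl(fd, IOCTL_GET_NUM_DEVICES, &num_devices) == 0)
		printf("QAT devices present: %u\n", num_devices);
	close(fd);
	return 0;
}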
// SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/projid.h> #include <linux/fs_struct.h> #include <linux/bsearch.h> #include <linux/sort.h> static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void free_user_ns(struct
work_struct *work); static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) { return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); } static void dec_user_namespaces(struct ucounts *ucounts) { return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); } static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same capabilities as init but useless for doing * anything as the capabilities are bound to the new user namespace. */ cred->securebits = SECUREBITS_DEFAULT; cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ cred->user_ns = user_ns; } static unsigned long enforced_nproc_rlimit(void) { unsigned long limit = RLIM_INFINITY; /* Is RLIMIT_NPROC currently enforced? */ if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || (current_user_ns() != &init_user_ns)) limit = rlimit(RLIMIT_NPROC); return limit; } /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the * new namespace. * * This is called by copy_creds(), which will finish setting the target task's * credentials. */ int create_user_ns(struct cred *new) { struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; struct ucounts *ucounts; int ret, i; ret = -ENOSPC; if (parent_ns->level > 32) goto fail; ucounts = inc_user_namespaces(parent_ns, owner); if (!ucounts) goto fail; /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, * by verifying that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; if (current_chrooted()) goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) goto fail_dec; ret = security_create_user_ns(new); if (ret < 0) goto fail_dec; ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); ret = ns_alloc_inum(&ns->ns); if (ret) goto fail_free; ns->ns.ops = &userns_operations; refcount_set(&ns->ns.count, 1); /* Leave the new->user_ns reference with the new user namespace. 
*/ ns->parent = parent_ns; ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); for (i = 0; i < UCOUNT_COUNTS; i++) { ns->ucount_max[i] = INT_MAX; } set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); #ifdef CONFIG_KEYS INIT_LIST_HEAD(&ns->keyring_name_list); init_rwsem(&ns->keyring_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) goto fail_keyring; set_cred_user_ns(new, ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_free_inum(&ns->ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: dec_user_namespaces(ucounts); fail: return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { struct cred *cred; int err = -ENOMEM; if (!(unshare_flags & CLONE_NEWUSER)) return 0; cred = prepare_creds(); if (cred) { err = create_user_ns(cred); if (err) put_cred(cred); else *new_cred = cred; } return err; } static void free_user_ns(struct work_struct *work) { struct user_namespace *parent, *ns = container_of(work, struct user_namespace, work); do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); } if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->uid_map.forward); kfree(ns->uid_map.reverse); } if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } #if IS_ENABLED(CONFIG_BINFMT_MISC) kfree(ns->binfmt_misc); #endif retire_userns_sysctls(ns); key_free_user_ns(ns); ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); ns = parent; } while (refcount_dec_and_test(&parent->ns.count)); } void __put_user_ns(struct user_namespace *ns) { schedule_work(&ns->work); } EXPORT_SYMBOL(__put_user_ns); /* * struct idmap_key - holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ u32 count; /* == 0 unless used with map_id_range_down() */ }; /* * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ static int cmp_map_id(const void *k, const void *e) { u32 first, last, id2; const struct idmap_key *key = k; const struct uid_gid_extent *el = e; id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) first = el->lower_first; else first = el->first; last = first + el->count - 1; if (key->id >= first && key->id <= last && (id2 >= first && id2 <= last)) return 0; if (key->id < first || id2 < first) return -1; return 1; } /* * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. 
*/ static struct uid_gid_extent * map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = false; key.count = count; key.id = id; return bsearch(&key, map->forward, extents, sizeof(struct uid_gid_extent), cmp_map_id); } /* * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_down_base(extents, map, id, count); else extent = map_id_range_down_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->first) + extent->lower_first; else id = (u32) -1; return id; } u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } /* * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) { unsigned idx; u32 first, last; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last) return &map->extent[idx]; } return NULL; } /* * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) { struct idmap_key key; key.map_up = true; key.count = 1; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } u32 map_id_up(struct uid_gid_map *map, u32 id) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_up_base(extents, map, id); else extent = map_id_up_max(extents, map, id); /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; else id = (u32) -1; return id; } /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in * @uid: User identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) { /* Map the uid to a global kernel uid */ return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); } EXPORT_SYMBOL(make_kuid); /** * from_kuid - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. 
* * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * If @kuid has no mapping in @targ (uid_t)-1 is returned. */ uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->uid_map, __kuid_val(kuid)); } EXPORT_SYMBOL(from_kuid); /** * from_kuid_munged - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kuid from_kuid_munged never fails and always * returns a valid uid. This makes from_kuid_munged appropriate * for use in syscalls like stat and getuid where failing the * system call and failing to provide a valid uid are not an * options. * * If @kuid has no mapping in @targ overflowuid is returned. */ uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) { uid_t uid; uid = from_kuid(targ, kuid); if (uid == (uid_t) -1) uid = overflowuid; return uid; } EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. * * When there is no mapping defined for the user-namespace gid * pair INVALID_GID is returned. Callers are expected to test * for and handle INVALID_GID being returned. INVALID_GID may be * tested for using gid_valid(). */ kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { /* Map the gid to a global kernel gid */ return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); } EXPORT_SYMBOL(make_kgid); /** * from_kgid - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * If @kgid has no mapping in @targ (gid_t)-1 is returned. */ gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) { /* Map the gid from a global kernel gid */ return map_id_up(&targ->gid_map, __kgid_val(kgid)); } EXPORT_SYMBOL(from_kgid); /** * from_kgid_munged - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kgid from_kgid_munged never fails and always * returns a valid gid. This makes from_kgid_munged appropriate * for use in syscalls like stat and getgid where failing the * system call and failing to provide a valid gid are not options. * * If @kgid has no mapping in @targ overflowgid is returned. */ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) { gid_t gid; gid = from_kgid(targ, kgid); if (gid == (gid_t) -1) gid = overflowgid; return gid; } EXPORT_SYMBOL(from_kgid_munged); /** * make_kprojid - Map a user-namespace projid pair into a kprojid. * @ns: User namespace that the projid is in * @projid: Project identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. 
* * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) { /* Map the uid to a global kernel uid */ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); } EXPORT_SYMBOL(make_kprojid); /** * from_kprojid - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal project identifier to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * If @kprojid has no mapping in @targ (projid_t)-1 is returned. */ projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); } EXPORT_SYMBOL(from_kprojid); /** * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal projid to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kprojid from_kprojid_munged never fails and always * returns a valid projid. This makes from_kprojid_munged * appropriate for use in syscalls like stat and where * failing the system call and failing to provide a valid projid are * not an options. * * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. */ projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) { projid_t projid; projid = from_kprojid(targ, kprojid); if (projid == (projid_t) -1) projid = OVERFLOW_PROJID; return projid; } EXPORT_SYMBOL(from_kprojid_munged); static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; uid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int gid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; gid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int projid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; projid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; unsigned extents = map->nr_extents; smp_rmb(); if (pos >= extents) return NULL; if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos]; 
} static void *uid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->uid_map); } static void *gid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->gid_map); } static void *projid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->projid_map); } static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return seq->op->start(seq, pos); } static void m_stop(struct seq_file *seq, void *v) { return; } const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, .show = projid_m_show, }; static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; upper_first = extent->first; lower_first = extent->lower_first; upper_last = upper_first + extent->count - 1; lower_last = lower_first + extent->count - 1; for (idx = 0; idx < new_map->nr_extents; idx++) { u32 prev_upper_first, prev_lower_first; u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) prev = &new_map->extent[idx]; else prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; prev_upper_last = prev_upper_first + prev->count - 1; prev_lower_last = prev_lower_first + prev->count - 1; /* Does the upper range intersect a previous extent? */ if ((prev_upper_first <= upper_last) && (prev_upper_last >= upper_first)) return true; /* Does the lower range intersect a previous extent? */ if ((prev_lower_first <= lower_last) && (prev_lower_last >= lower_first)) return true; } return false; } /* * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; /* Allocate memory for 340 mappings. */ forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!forward) return -ENOMEM; /* Copy over memory. Only set up memory for the forward pointer. * Defer the memory setup for the reverse pointer. 
*/ memcpy(forward, map->extent, map->nr_extents * sizeof(map->extent[0])); map->forward = forward; map->reverse = NULL; } if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) dest = &map->extent[map->nr_extents]; else dest = &map->forward[map->nr_extents]; *dest = *extent; map->nr_extents++; return 0; } /* cmp function to sort() forward mappings */ static int cmp_extents_forward(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->first < e2->first) return -1; if (e1->first > e2->first) return 1; return 0; } /* cmp function to sort() reverse mappings */ static int cmp_extents_reverse(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->lower_first < e2->lower_first) return -1; if (e1->lower_first > e2->lower_first) return 1; return 0; } /* * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static int sort_idmaps(struct uid_gid_map *map) { if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return 0; /* Sort forward array. */ sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_forward, NULL); /* Only copy the memory from forward we actually need. */ map->reverse = kmemdup_array(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!map->reverse) return -ENOMEM; /* Sort reverse array. */ sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_reverse, NULL); return 0; } /** * verify_root_map() - check the uid 0 mapping * @file: idmapping file * @map_ns: user namespace of the target process * @new_map: requested idmap * * If a process requests mapping parent uid 0 into the new ns, verify that the * process writing the map had the CAP_SETFCAP capability as the target process * will be able to write fscaps that are valid in ancestor user namespaces. * * Return: true if the mapping is allowed, false if not. */ static bool verify_root_map(const struct file *file, struct user_namespace *map_ns, struct uid_gid_map *new_map) { int idx; const struct user_namespace *file_ns = file->f_cred->user_ns; struct uid_gid_extent *extent0 = NULL; for (idx = 0; idx < new_map->nr_extents; idx++) { if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent0 = &new_map->extent[idx]; else extent0 = &new_map->forward[idx]; if (extent0->lower_first == 0) break; extent0 = NULL; } if (!extent0) return true; if (map_ns == file_ns) { /* The process unshared its ns and is writing to its own * /proc/self/uid_map. User already has full capabilites in * the new namespace. Verify that the parent had CAP_SETFCAP * when it unshared. * */ if (!file_ns->parent_could_setfcap) return false; } else { /* Process p1 is writing to uid_map of p2, who is in a child * user namespace to p1's. 
Verify that the opener of the map * file has CAP_SETFCAP against the parent of the new map * namespace */ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) return false; } return true; } static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, struct uid_gid_map *map, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; struct user_namespace *map_ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; char *kbuf, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * * An id map fits within 1 cache line on most architectures. * * On read nothing needs to be done unless you are on an * architecture with a crazy cache coherency model like alpha. * * There is a one time data dependency between reading the * count of the extents and the values of the extents. The * desired behavior is to see the values of the extents that * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ mutex_lock(&userns_state_mutex); memset(&new_map, 0, sizeof(struct uid_gid_map)); ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) goto out; /* * Adjusting namespace settings requires capabilities on the target. */ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ ret = -EINVAL; pos = kbuf; for (; pos; pos = next_line) { /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } pos = skip_spaces(pos); extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; /* Verify we have been given valid starting values */ if ((extent.first == (u32) -1) || (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ if ((extent.first + extent.count) <= extent.first) goto out; if ((extent.lower_first + extent.count) <= extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ if (mappings_overlap(&new_map, &extent)) goto out; if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; ret = insert_extent(&new_map, &extent); if (ret < 0) goto out; ret = -EINVAL; } /* Be very certain the new map actually exists */ if (new_map.nr_extents == 0) goto out; ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) goto out; ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. 
*/ for (idx = 0; idx < new_map.nr_extents; idx++) { struct uid_gid_extent *e; u32 lower_first; if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) e = &new_map.extent[idx]; else e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, e->lower_first, e->count); /* Fail if we can not map the specified extent to * the kernel global id space. */ if (lower_first == (u32) -1) goto out; e->lower_first = lower_first; } /* * If we want to use binary search for lookup, this clones the extent * array and sorts both copies. */ ret = sort_idmaps(&new_map); if (ret < 0) goto out; /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, new_map.nr_extents * sizeof(new_map.extent[0])); } else { map->forward = new_map.forward; map->reverse = new_map.reverse; } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(new_map.forward); kfree(new_map.reverse); map->forward = NULL; map->reverse = NULL; map->nr_extents = 0; } mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; } ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; /* Anyone can set any valid project id no capability needed */ return map_write(file, buf, size, ppos, -1, &ns->projid_map, &ns->parent->projid_map); } static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) return false; /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && gid_eq(gid, cred->egid)) return true; } } /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. 
* And the opener of the id file also has the appropriate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) return true; return false; } int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? "allow" : "deny"); return 0; } ssize_t proc_setgroups_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; char kbuf[8], *pos; bool setgroups_allowed; ssize_t ret; /* Only allow a very narrow range of strings to be written */ ret = -EINVAL; if ((*ppos != 0) || (count >= sizeof(kbuf))) goto out; /* What was written? */ ret = -EFAULT; if (copy_from_user(kbuf, buf, count)) goto out; kbuf[count] = '\0'; pos = kbuf; /* What is being requested? */ ret = -EINVAL; if (strncmp(pos, "allow", 5) == 0) { pos += 5; setgroups_allowed = true; } else if (strncmp(pos, "deny", 4) == 0) { pos += 4; setgroups_allowed = false; } else goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; ret = -EPERM; mutex_lock(&userns_state_mutex); if (setgroups_allowed) { /* Enabling setgroups after setgroups has been disabled * is not allowed. */ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) goto out_unlock; } else { /* Permanently disabling setgroups after setgroups has * been enabled by writing the gid_map is not allowed. */ if (ns->gid_map.nr_extents != 0) goto out_unlock; ns->flags &= ~USERNS_SETGROUPS_ALLOWED; } mutex_unlock(&userns_state_mutex); /* Report a successful write */ *ppos = count; ret = count; out: return ret; out_unlock: mutex_unlock(&userns_state_mutex); goto out; } bool userns_may_setgroups(const struct user_namespace *ns) { bool allowed; mutex_lock(&userns_state_mutex); /* It is not safe to use setgroups until a gid mapping in * the user namespace has been established. */ allowed = ns->gid_map.nr_extents != 0; /* Is setgroups allowed? */ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); mutex_unlock(&userns_state_mutex); return allowed; } /* * Returns true if @child is the same namespace or a descendant of * @ancestor. */ bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { const struct user_namespace *ns; for (ns = child; ns->level > ancestor->level; ns = ns->parent) ; return (ns == ancestor); } bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } EXPORT_SYMBOL(current_in_userns); static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; rcu_read_lock(); user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); return user_ns ? &user_ns->ns : NULL; } static void userns_put(struct ns_common *ns) { put_user_ns(to_user_ns(ns)); } static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering * the same user namespace. 
*/ if (user_ns == current_user_ns()) return -EINVAL; /* Tasks that share a thread group must share a user namespace */ if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) return -EINVAL; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; cred = nsset_cred(nsset); if (!cred) return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); if (set_cred_ucounts(cred) < 0) return -EINVAL; return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) { struct user_namespace *my_user_ns = current_user_ns(); struct user_namespace *owner, *p; /* See if the owner is in the current user namespace */ owner = p = ns->ops->owner(ns); for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == my_user_ns) break; p = p->parent; } return &get_user_ns(owner)->ns; } static struct user_namespace *userns_owner(struct ns_common *ns) { return to_user_ns(ns)->parent; } const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, .owner = userns_owner, .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); return 0; } subsys_initcall(user_namespaces_init); |
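The write-once map semantics enforced by map_write() and the setgroups ordering checked in new_idmap_permitted() above are easiest to see from userspace. The sketch below is an editor-added illustration, not kernel code: it unshares a user namespace and installs the only mapping an unprivileged writer is permitted, a single extent of length one mapping id 0 onto the writer's own euid/egid, writing "deny" to /proc/self/setgroups before gid_map. The helper name write_file() is hypothetical and used only here.

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* One write at offset 0, shorter than a page: exactly what map_write() accepts. */
static int write_file(const char *path, const char *buf)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, buf, strlen(buf)) != (ssize_t)strlen(buf)) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	char line[64];
	uid_t uid = geteuid();
	gid_t gid = getegid();

	if (unshare(CLONE_NEWUSER) < 0) {
		perror("unshare");
		return 1;
	}
	/* Single extent, count 1, lower id == our euid: allowed without privilege. */
	snprintf(line, sizeof(line), "0 %u 1\n", uid);
	if (write_file("/proc/self/uid_map", line) < 0)
		perror("uid_map");
	/* gid_map from an unprivileged writer requires setgroups to be denied first. */
	if (write_file("/proc/self/setgroups", "deny") < 0)
		perror("setgroups");
	snprintf(line, sizeof(line), "0 %u 1\n", gid);
	if (write_file("/proc/self/gid_map", line) < 0)
		perror("gid_map");
	return 0;
}

A second write to either map, or an extent list that wraps or overlaps, is rejected by the checks in map_write() above.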
// SPDX-License-Identifier: GPL-2.0-only /* * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/module.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <net/route.h> #include <linux/ip.h> #include <net/ip.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("iptables mangle table"); #define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ (1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_MANGLE, }; static unsigned int ipt_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret, verdict; const struct iphdr *iph; __be32 saddr, daddr; u32 mark; int err; u8 tos; /* Save things which could affect route */ mark = skb->mark; iph = ip_hdr(skb); saddr = iph->saddr; daddr = iph->daddr; tos = iph->tos; ret = ipt_do_table(priv, skb, state); verdict = ret & NF_VERDICT_MASK; /* Reroute for ANY change. */ if (verdict != NF_DROP && verdict != NF_STOLEN) { iph = ip_hdr(skb); if (iph->saddr != saddr || iph->daddr != daddr || skb->mark != mark || iph->tos != tos) { err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } } return ret; } /* The work comes in here from netfilter.c.
*/ static unsigned int iptable_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (state->hook == NF_INET_LOCAL_OUT) return ipt_mangle_out(priv, skb, state); return ipt_do_table(priv, skb, state); } static struct nf_hook_ops *mangle_ops __read_mostly; static int iptable_mangle_table_init(struct net *net) { struct ipt_replace *repl; int ret; repl = ipt_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops); kfree(repl); return ret; } static void __net_exit iptable_mangle_net_pre_exit(struct net *net) { ipt_unregister_table_pre_exit(net, "mangle"); } static void __net_exit iptable_mangle_net_exit(struct net *net) { ipt_unregister_table_exit(net, "mangle"); } static struct pernet_operations iptable_mangle_net_ops = { .pre_exit = iptable_mangle_net_pre_exit, .exit = iptable_mangle_net_exit, }; static int __init iptable_mangle_init(void) { int ret = xt_register_template(&packet_mangler, iptable_mangle_table_init); if (ret < 0) return ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook); if (IS_ERR(mangle_ops)) { xt_unregister_template(&packet_mangler); ret = PTR_ERR(mangle_ops); return ret; } ret = register_pernet_subsys(&iptable_mangle_net_ops); if (ret < 0) { xt_unregister_template(&packet_mangler); kfree(mangle_ops); return ret; } return ret; } static void __exit iptable_mangle_fini(void) { unregister_pernet_subsys(&iptable_mangle_net_ops); xt_unregister_template(&packet_mangler); kfree(mangle_ops); } module_init(iptable_mangle_init); module_exit(iptable_mangle_fini); |
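The comment above ("the work comes in here from netfilter.c") refers to the netfilter core invoking registered hooks in priority order at each of the MANGLE_VALID_HOOKS points. As a rough, hypothetical sketch of that mechanism, the module below registers its own observer at NF_INET_LOCAL_OUT just after the mangle table (priority NF_IP_PRI_MANGLE + 1); every demo_* identifier is invented for this illustration and is not part of iptable_mangle.

#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>

/* Runs after the mangle table has had its chance to modify the packet. */
static unsigned int demo_out_hook(void *priv, struct sk_buff *skb,
				  const struct nf_hook_state *state)
{
	return NF_ACCEPT;
}

static const struct nf_hook_ops demo_ops = {
	.hook		= demo_out_hook,
	.pf		= NFPROTO_IPV4,
	.hooknum	= NF_INET_LOCAL_OUT,
	.priority	= NF_IP_PRI_MANGLE + 1,
};

static int __net_init demo_net_init(struct net *net)
{
	return nf_register_net_hook(net, &demo_ops);
}

static void __net_exit demo_net_exit(struct net *net)
{
	nf_unregister_net_hook(net, &demo_ops);
}

static struct pernet_operations demo_net_ops = {
	.init = demo_net_init,
	.exit = demo_net_exit,
};

static int __init demo_init(void)
{
	return register_pernet_subsys(&demo_net_ops);
}

static void __exit demo_exit(void)
{
	unregister_pernet_subsys(&demo_net_ops);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");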
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * vma.h * * Core VMA manipulation API implemented in vma.c. */ #ifndef __MM_VMA_H #define __MM_VMA_H /* * VMA lock generalization */ struct vma_prepare { struct vm_area_struct *vma; struct vm_area_struct *adj_next; struct file *file; struct address_space *mapping; struct anon_vma *anon_vma; struct vm_area_struct *insert; struct vm_area_struct *remove; struct vm_area_struct *remove2; }; struct unlink_vma_file_batch { int count; struct vm_area_struct *vmas[8]; }; /* * vma munmap operation */ struct vma_munmap_struct { struct vma_iterator *vmi; struct vm_area_struct *vma; /* The first vma to munmap */ struct vm_area_struct *prev; /* vma before the munmap area */ struct vm_area_struct *next; /* vma after the munmap area */ struct list_head *uf; /* Userfaultfd list_head */ unsigned long start; /* Aligned start addr (inclusive) */ unsigned long end; /* Aligned end addr (exclusive) */ unsigned long unmap_start; /* Unmap PTE start */ unsigned long unmap_end; /* Unmap PTE end */ int vma_count; /* Number of vmas that will be removed */ bool unlock; /* Unlock after the munmap */ bool clear_ptes; /* If there are outstanding PTE to be cleared */ /* 2 byte hole */ unsigned long nr_pages; /* Number of pages being removed */ unsigned long locked_vm; /* Number of locked pages */ unsigned long nr_accounted; /* Number of VM_ACCOUNT pages */ unsigned long exec_vm; unsigned long stack_vm; unsigned long data_vm; }; enum vma_merge_state { VMA_MERGE_START, VMA_MERGE_ERROR_NOMEM, VMA_MERGE_NOMERGE, VMA_MERGE_SUCCESS, }; enum vma_merge_flags { VMG_FLAG_DEFAULT = 0, /* * If we can expand, simply do so.
We know there is nothing to merge to * the right. Does not reset state upon failure to merge. The VMA * iterator is assumed to be positioned at the previous VMA, rather than * at the gap. */ VMG_FLAG_JUST_EXPAND = 1 << 0, }; /* Represents a VMA merge operation. */ struct vma_merge_struct { struct mm_struct *mm; struct vma_iterator *vmi; pgoff_t pgoff; struct vm_area_struct *prev; struct vm_area_struct *next; /* Modified by vma_merge(). */ struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */ unsigned long start; unsigned long end; unsigned long flags; struct file *file; struct anon_vma *anon_vma; struct mempolicy *policy; struct vm_userfaultfd_ctx uffd_ctx; struct anon_vma_name *anon_name; enum vma_merge_flags merge_flags; enum vma_merge_state state; }; static inline bool vmg_nomem(struct vma_merge_struct *vmg) { return vmg->state == VMA_MERGE_ERROR_NOMEM; } /* Assumes addr >= vma->vm_start. */ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, unsigned long addr) { return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); } #define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_) \ struct vma_merge_struct name = { \ .mm = mm_, \ .vmi = vmi_, \ .start = start_, \ .end = end_, \ .flags = flags_, \ .pgoff = pgoff_, \ .state = VMA_MERGE_START, \ .merge_flags = VMG_FLAG_DEFAULT, \ } #define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \ struct vma_merge_struct name = { \ .mm = vma_->vm_mm, \ .vmi = vmi_, \ .prev = prev_, \ .next = NULL, \ .vma = vma_, \ .start = start_, \ .end = end_, \ .flags = vma_->vm_flags, \ .pgoff = vma_pgoff_offset(vma_, start_), \ .file = vma_->vm_file, \ .anon_vma = vma_->anon_vma, \ .policy = vma_policy(vma_), \ .uffd_ctx = vma_->vm_userfaultfd_ctx, \ .anon_name = anon_vma_name(vma_), \ .state = VMA_MERGE_START, \ .merge_flags = VMG_FLAG_DEFAULT, \ } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE void validate_mm(struct mm_struct *mm); #else #define validate_mm(mm) do { } while (0) #endif __must_check int vma_expand(struct vma_merge_struct *vmg); __must_check int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff); static inline int vma_iter_store_gfp(struct vma_iterator *vmi, struct vm_area_struct *vma, gfp_t gfp) { if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_gfp(&vmi->mas, vma, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; return 0; } int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock); int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); void remove_vma(struct vm_area_struct *vma, bool unreachable); void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next); /* We are about to modify the VMA's flags. */ __must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags); /* We are about to modify the VMA's flags and/or anon_name. 
*/ __must_check struct vm_area_struct *vma_modify_flags_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, struct anon_vma_name *new_name); /* We are about to modify the VMA's memory policy. */ __must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct mempolicy *new_pol); /* We are about to modify the VMA's flags and/or uffd context. */ __must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, struct vm_userfaultfd_ctx new_ctx); __must_check struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); __must_check struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long delta); void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb); void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb); void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, struct vm_area_struct *vma); void unlink_file_vma(struct vm_area_struct *vma); void vma_link_file(struct vm_area_struct *vma); int vma_link(struct mm_struct *mm, struct vm_area_struct *vma); struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks); struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma); bool vma_needs_dirty_tracking(struct vm_area_struct *vma); bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); int mm_take_all_locks(struct mm_struct *mm); void mm_drop_all_locks(struct mm_struct *mm); unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf); int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, unsigned long addr, unsigned long request, unsigned long flags); unsigned long unmapped_area(struct vm_unmapped_area_info *info); unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) { /* * We want to check manually if we can change individual PTEs writable * if we can't do that automatically for all PTEs in a mapping. For * private mappings, that's always the case when we have write * permissions as we properly have to handle COW. */ if (vma->vm_flags & VM_SHARED) return vma_wants_writenotify(vma, vma->vm_page_prot); return !!(vma->vm_flags & VM_WRITE); } #ifdef CONFIG_MMU static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); } #endif static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, unsigned long min) { return mas_prev(&vmi->mas, min); } /* * These three helpers classifies VMAs for virtual memory accounting. */ /* * Executable code area - executable, not writable, not stack */ static inline bool is_exec_mapping(vm_flags_t flags) { return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; } /* * Stack area (including shadow stacks) * * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: * do_mmap() forbids all other combinations. 
*/ static inline bool is_stack_mapping(vm_flags_t flags) { return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK); } /* * Data area - private, writable, not stack */ static inline bool is_data_mapping(vm_flags_t flags) { return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; } static inline void vma_iter_config(struct vma_iterator *vmi, unsigned long index, unsigned long last) { __mas_set_range(&vmi->mas, index, last - 1); } static inline void vma_iter_reset(struct vma_iterator *vmi) { mas_reset(&vmi->mas); } static inline struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) { return mas_prev_range(&vmi->mas, min); } static inline struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) { return mas_next_range(&vmi->mas, max); } static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, unsigned long max, unsigned long size) { return mas_empty_area(&vmi->mas, min, max - 1, size); } static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, unsigned long max, unsigned long size) { return mas_empty_area_rev(&vmi->mas, min, max - 1, size); } /* * VMA Iterator functions shared between nommu and mmap */ static inline int vma_iter_prealloc(struct vma_iterator *vmi, struct vm_area_struct *vma) { return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); } static inline void vma_iter_clear(struct vma_iterator *vmi) { mas_store_prealloc(&vmi->mas, NULL); } static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) { return mas_walk(&vmi->mas); } /* Store a VMA with preallocated memory */ static inline void vma_iter_store(struct vma_iterator *vmi, struct vm_area_struct *vma) { #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.index > vma->vm_start)) { pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", vmi->mas.index, vma->vm_start, vma->vm_start, vma->vm_end, vmi->mas.index, vmi->mas.last); } if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.last < vma->vm_start)) { pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, vmi->mas.index, vmi->mas.last); } #endif if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_prealloc(&vmi->mas, vma); } static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) { return vmi->mas.index; } static inline unsigned long vma_iter_end(struct vma_iterator *vmi) { return vmi->mas.last + 1; } static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, unsigned long count) { return mas_expected_entries(&vmi->mas, count); } static inline struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) { return mas_prev_range(&vmi->mas, 0); } /* * Retrieve the next VMA and rewind the iterator to end of the previous VMA, or * if no previous VMA, to index 0. */ static inline struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi, struct vm_area_struct **pprev) { struct vm_area_struct *next = vma_next(vmi); struct vm_area_struct *prev = vma_prev(vmi); /* * Consider the case where no previous VMA exists. We advance to the * next VMA, skipping any gap, then rewind to the start of the range. 
* * If we were to unconditionally advance to the next range we'd wind up * at the next VMA again, so we check to ensure there is a previous VMA * to skip over. */ if (prev) vma_iter_next_range(vmi); if (pprev) *pprev = prev; return next; } #ifdef CONFIG_64BIT static inline bool vma_is_sealed(struct vm_area_struct *vma) { return (vma->vm_flags & VM_SEALED); } /* * check if a vma is sealed for modification. * return true, if modification is allowed. */ static inline bool can_modify_vma(struct vm_area_struct *vma) { if (unlikely(vma_is_sealed(vma))) return false; return true; } bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior); #else static inline bool can_modify_vma(struct vm_area_struct *vma) { return true; } static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) { return true; } #endif #if defined(CONFIG_STACK_GROWSUP) int expand_upwards(struct vm_area_struct *vma, unsigned long address); #endif int expand_downwards(struct vm_area_struct *vma, unsigned long address); int __vm_munmap(unsigned long start, size_t len, bool unlock); #endif /* __MM_VMA_H */ |
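To show how the accounting classifiers above (is_exec_mapping(), is_stack_mapping(), is_data_mapping()) combine with the VMA iterator wrappers, here is an editor-added, hypothetical helper assumed to live alongside vma.h inside mm/, using VMA_ITERATOR() and for_each_vma() from <linux/mm.h>; count_vma_kinds() does not exist in the kernel.

#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include "vma.h"

/* Walk every VMA of @mm and tally how each one would be accounted. */
static void count_vma_kinds(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	unsigned long exec = 0, stack = 0, data = 0, other = 0;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_read_lock(mm);
	for_each_vma(vmi, vma) {
		if (is_exec_mapping(vma->vm_flags))
			exec++;
		else if (is_stack_mapping(vma->vm_flags))
			stack++;
		else if (is_data_mapping(vma->vm_flags))
			data++;
		else
			other++;
	}
	mmap_read_unlock(mm);

	pr_info("exec=%lu stack=%lu data=%lu other=%lu\n",
		exec, stack, data, other);
}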
// SPDX-License-Identifier: GPL-2.0-only /* binder.c * * Android IPC Subsystem * * Copyright (C) 2007-2008 Google, Inc. */ /* * Locking overview * * There are 3 main spinlocks which must be acquired in the * order shown: * * 1) proc->outer_lock : protects binder_ref * binder_proc_lock() and binder_proc_unlock() are * used to acq/rel. * 2) node->lock : protects most fields of binder_node. * binder_node_lock() and binder_node_unlock() are * used to acq/rel * 3) proc->inner_lock : protects the thread and node lists * (proc->threads, proc->waiting_threads, proc->nodes) * and all todo lists associated with the binder_proc * (proc->todo, thread->todo, proc->delivered_death and * node->async_todo), as well as thread->transaction_stack * binder_inner_proc_lock() and binder_inner_proc_unlock() * are used to acq/rel * * Any lock under procA must never be nested under any lock at the same * level or below on procB. * * Functions that require a lock held on entry indicate which lock * in the suffix of the function name: * * foo_olocked() : requires proc->outer_lock * foo_nlocked() : requires node->lock * foo_ilocked() : requires proc->inner_lock * foo_oilocked(): requires proc->outer_lock and proc->inner_lock * foo_nilocked(): requires node->lock and proc->inner_lock * ...
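 *
 * Illustrative nesting that follows the order above (editor's sketch, not
 * code taken from this driver; 'proc' and 'node' are hypothetical locals):
 *
 *   binder_proc_lock(proc);          // 1) proc->outer_lock
 *   binder_node_lock(node);          // 2) node->lock
 *   binder_inner_proc_lock(proc);    // 3) proc->inner_lock
 *   ...
 *   binder_inner_proc_unlock(proc);  // release in reverse order
 *   binder_node_unlock(node);
 *   binder_proc_unlock(proc);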
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/fdtable.h> #include <linux/file.h> #include <linux/freezer.h> #include <linux/fs.h> #include <linux/list.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/nsproxy.h> #include <linux/poll.h> #include <linux/debugfs.h> #include <linux/rbtree.h> #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/seq_file.h> #include <linux/string.h> #include <linux/uaccess.h> #include <linux/pid_namespace.h> #include <linux/security.h> #include <linux/spinlock.h> #include <linux/ratelimit.h> #include <linux/syscalls.h> #include <linux/task_work.h> #include <linux/sizes.h> #include <linux/ktime.h> #include <uapi/linux/android/binder.h> #include <linux/cacheflush.h> #include "binder_internal.h" #include "binder_trace.h" static HLIST_HEAD(binder_deferred_list); static DEFINE_MUTEX(binder_deferred_lock); static HLIST_HEAD(binder_devices); static HLIST_HEAD(binder_procs); static DEFINE_MUTEX(binder_procs_lock); static HLIST_HEAD(binder_dead_nodes); static DEFINE_SPINLOCK(binder_dead_nodes_lock); static struct dentry *binder_debugfs_dir_entry_root; static struct dentry *binder_debugfs_dir_entry_proc; static atomic_t binder_last_id; static int proc_show(struct seq_file *m, void *unused); DEFINE_SHOW_ATTRIBUTE(proc); #define FORBIDDEN_MMAP_FLAGS (VM_WRITE) enum { BINDER_DEBUG_USER_ERROR = 1U << 0, BINDER_DEBUG_FAILED_TRANSACTION = 1U << 1, BINDER_DEBUG_DEAD_TRANSACTION = 1U << 2, BINDER_DEBUG_OPEN_CLOSE = 1U << 3, BINDER_DEBUG_DEAD_BINDER = 1U << 4, BINDER_DEBUG_DEATH_NOTIFICATION = 1U << 5, BINDER_DEBUG_READ_WRITE = 1U << 6, BINDER_DEBUG_USER_REFS = 1U << 7, BINDER_DEBUG_THREADS = 1U << 8, BINDER_DEBUG_TRANSACTION = 1U << 9, BINDER_DEBUG_TRANSACTION_COMPLETE = 1U << 10, BINDER_DEBUG_FREE_BUFFER = 1U << 11, BINDER_DEBUG_INTERNAL_REFS = 1U << 12, BINDER_DEBUG_PRIORITY_CAP = 1U << 13, BINDER_DEBUG_SPINLOCKS = 1U << 14, }; static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR | BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION; module_param_named(debug_mask, binder_debug_mask, uint, 0644); char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; module_param_named(devices, binder_devices_param, charp, 0444); static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait); static int binder_stop_on_user_error; static int binder_set_stop_on_user_error(const char *val, const struct kernel_param *kp) { int ret; ret = param_set_int(val, kp); if (binder_stop_on_user_error < 2) wake_up(&binder_user_error_wait); return ret; } module_param_call(stop_on_user_error, binder_set_stop_on_user_error, param_get_int, &binder_stop_on_user_error, 0644); static __printf(2, 3) void binder_debug(int mask, const char *format, ...) { struct va_format vaf; va_list args; if (binder_debug_mask & mask) { va_start(args, format); vaf.va = &args; vaf.fmt = format; pr_info_ratelimited("%pV", &vaf); va_end(args); } } #define binder_txn_error(x...) \ binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, x) static __printf(1, 2) void binder_user_error(const char *format, ...) 
{ struct va_format vaf; va_list args; if (binder_debug_mask & BINDER_DEBUG_USER_ERROR) { va_start(args, format); vaf.va = &args; vaf.fmt = format; pr_info_ratelimited("%pV", &vaf); va_end(args); } if (binder_stop_on_user_error) binder_stop_on_user_error = 2; } #define binder_set_extended_error(ee, _id, _command, _param) \ do { \ (ee)->id = _id; \ (ee)->command = _command; \ (ee)->param = _param; \ } while (0) #define to_flat_binder_object(hdr) \ container_of(hdr, struct flat_binder_object, hdr) #define to_binder_fd_object(hdr) container_of(hdr, struct binder_fd_object, hdr) #define to_binder_buffer_object(hdr) \ container_of(hdr, struct binder_buffer_object, hdr) #define to_binder_fd_array_object(hdr) \ container_of(hdr, struct binder_fd_array_object, hdr) static struct binder_stats binder_stats; static inline void binder_stats_deleted(enum binder_stat_types type) { atomic_inc(&binder_stats.obj_deleted[type]); } static inline void binder_stats_created(enum binder_stat_types type) { atomic_inc(&binder_stats.obj_created[type]); } struct binder_transaction_log_entry { int debug_id; int debug_id_done; int call_type; int from_proc; int from_thread; int target_handle; int to_proc; int to_thread; int to_node; int data_size; int offsets_size; int return_error_line; uint32_t return_error; uint32_t return_error_param; char context_name[BINDERFS_MAX_NAME + 1]; }; struct binder_transaction_log { atomic_t cur; bool full; struct binder_transaction_log_entry entry[32]; }; static struct binder_transaction_log binder_transaction_log; static struct binder_transaction_log binder_transaction_log_failed; static struct binder_transaction_log_entry *binder_transaction_log_add( struct binder_transaction_log *log) { struct binder_transaction_log_entry *e; unsigned int cur = atomic_inc_return(&log->cur); if (cur >= ARRAY_SIZE(log->entry)) log->full = true; e = &log->entry[cur % ARRAY_SIZE(log->entry)]; WRITE_ONCE(e->debug_id_done, 0); /* * write-barrier to synchronize access to e->debug_id_done. * We make sure the initialized 0 value is seen before * memset() other fields are zeroed by memset. */ smp_wmb(); memset(e, 0, sizeof(*e)); return e; } enum binder_deferred_state { BINDER_DEFERRED_FLUSH = 0x01, BINDER_DEFERRED_RELEASE = 0x02, }; enum { BINDER_LOOPER_STATE_REGISTERED = 0x01, BINDER_LOOPER_STATE_ENTERED = 0x02, BINDER_LOOPER_STATE_EXITED = 0x04, BINDER_LOOPER_STATE_INVALID = 0x08, BINDER_LOOPER_STATE_WAITING = 0x10, BINDER_LOOPER_STATE_POLL = 0x20, }; /** * binder_proc_lock() - Acquire outer lock for given binder_proc * @proc: struct binder_proc to acquire * * Acquires proc->outer_lock. Used to protect binder_ref * structures associated with the given proc. */ #define binder_proc_lock(proc) _binder_proc_lock(proc, __LINE__) static void _binder_proc_lock(struct binder_proc *proc, int line) __acquires(&proc->outer_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_lock(&proc->outer_lock); } /** * binder_proc_unlock() - Release outer lock for given binder_proc * @proc: struct binder_proc to acquire * * Release lock acquired via binder_proc_lock() */ #define binder_proc_unlock(proc) _binder_proc_unlock(proc, __LINE__) static void _binder_proc_unlock(struct binder_proc *proc, int line) __releases(&proc->outer_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_unlock(&proc->outer_lock); } /** * binder_inner_proc_lock() - Acquire inner lock for given binder_proc * @proc: struct binder_proc to acquire * * Acquires proc->inner_lock. 
Used to protect todo lists */ #define binder_inner_proc_lock(proc) _binder_inner_proc_lock(proc, __LINE__) static void _binder_inner_proc_lock(struct binder_proc *proc, int line) __acquires(&proc->inner_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_lock(&proc->inner_lock); } /** * binder_inner_proc_unlock() - Release inner lock for given binder_proc * @proc: struct binder_proc to acquire * * Release lock acquired via binder_inner_proc_lock() */ #define binder_inner_proc_unlock(proc) _binder_inner_proc_unlock(proc, __LINE__) static void _binder_inner_proc_unlock(struct binder_proc *proc, int line) __releases(&proc->inner_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_unlock(&proc->inner_lock); } /** * binder_node_lock() - Acquire spinlock for given binder_node * @node: struct binder_node to acquire * * Acquires node->lock. Used to protect binder_node fields */ #define binder_node_lock(node) _binder_node_lock(node, __LINE__) static void _binder_node_lock(struct binder_node *node, int line) __acquires(&node->lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_lock(&node->lock); } /** * binder_node_unlock() - Release spinlock for given binder_node * @node: struct binder_node to acquire * * Release lock acquired via binder_node_lock() */ #define binder_node_unlock(node) _binder_node_unlock(node, __LINE__) static void _binder_node_unlock(struct binder_node *node, int line) __releases(&node->lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_unlock(&node->lock); } /** * binder_node_inner_lock() - Acquire node and inner locks * @node: struct binder_node to acquire * * Acquires node->lock. If node->proc is non-NULL, also acquires * proc->inner_lock.
Used to protect binder_node fields */ #define binder_node_inner_lock(node) _binder_node_inner_lock(node, __LINE__) static void _binder_node_inner_lock(struct binder_node *node, int line) __acquires(&node->lock) __acquires(&node->proc->inner_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_lock(&node->lock); if (node->proc) binder_inner_proc_lock(node->proc); else /* annotation for sparse */ __acquire(&node->proc->inner_lock); } /** * binder_node_inner_unlock() - Release node and inner locks * @node: struct binder_node to acquire * * Release locks acquired via binder_node_inner_lock() */ #define binder_node_inner_unlock(node) _binder_node_inner_unlock(node, __LINE__) static void _binder_node_inner_unlock(struct binder_node *node, int line) __releases(&node->lock) __releases(&node->proc->inner_lock) { struct binder_proc *proc = node->proc; binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); if (proc) binder_inner_proc_unlock(proc); else /* annotation for sparse */ __release(&node->proc->inner_lock); spin_unlock(&node->lock); } static bool binder_worklist_empty_ilocked(struct list_head *list) { return list_empty(list); } /** * binder_worklist_empty() - Check if no items on the work list * @proc: binder_proc associated with list * @list: list to check * * Return: true if there are no items on list, else false */ static bool binder_worklist_empty(struct binder_proc *proc, struct list_head *list) { bool ret; binder_inner_proc_lock(proc); ret = binder_worklist_empty_ilocked(list); binder_inner_proc_unlock(proc); return ret; } /** * binder_enqueue_work_ilocked() - Add an item to the work list * @work: struct binder_work to add to list * @target_list: list to add work to * * Adds the work to the specified list. Asserts that work * is not already on a list. * * Requires the proc->inner_lock to be held. */ static void binder_enqueue_work_ilocked(struct binder_work *work, struct list_head *target_list) { BUG_ON(target_list == NULL); BUG_ON(work->entry.next && !list_empty(&work->entry)); list_add_tail(&work->entry, target_list); } /** * binder_enqueue_deferred_thread_work_ilocked() - Add deferred thread work * @thread: thread to queue work to * @work: struct binder_work to add to list * * Adds the work to the todo list of the thread. Doesn't set the process_todo * flag, which means that (if it wasn't already set) the thread will go to * sleep without handling this work when it calls read. * * Requires the proc->inner_lock to be held. */ static void binder_enqueue_deferred_thread_work_ilocked(struct binder_thread *thread, struct binder_work *work) { WARN_ON(!list_empty(&thread->waiting_thread_node)); binder_enqueue_work_ilocked(work, &thread->todo); } /** * binder_enqueue_thread_work_ilocked() - Add an item to the thread work list * @thread: thread to queue work to * @work: struct binder_work to add to list * * Adds the work to the todo list of the thread, and enables processing * of the todo queue. * * Requires the proc->inner_lock to be held. */ static void binder_enqueue_thread_work_ilocked(struct binder_thread *thread, struct binder_work *work) { WARN_ON(!list_empty(&thread->waiting_thread_node)); binder_enqueue_work_ilocked(work, &thread->todo); /* (e)poll-based threads require an explicit wakeup signal when * queuing their own work; they rely on these events to consume * messages without blocking on I/O. Without it, threads risk waiting * indefinitely without handling the work.
*/ if (thread->looper & BINDER_LOOPER_STATE_POLL && thread->pid == current->pid && !thread->process_todo) wake_up_interruptible_sync(&thread->wait); thread->process_todo = true; } /** * binder_enqueue_thread_work() - Add an item to the thread work list * @thread: thread to queue work to * @work: struct binder_work to add to list * * Adds the work to the todo list of the thread, and enables processing * of the todo queue. */ static void binder_enqueue_thread_work(struct binder_thread *thread, struct binder_work *work) { binder_inner_proc_lock(thread->proc); binder_enqueue_thread_work_ilocked(thread, work); binder_inner_proc_unlock(thread->proc); } static void binder_dequeue_work_ilocked(struct binder_work *work) { list_del_init(&work->entry); } /** * binder_dequeue_work() - Removes an item from the work list * @proc: binder_proc associated with list * @work: struct binder_work to remove from list * * Removes the specified work item from whatever list it is on. * Can safely be called if work is not on any list. */ static void binder_dequeue_work(struct binder_proc *proc, struct binder_work *work) { binder_inner_proc_lock(proc); binder_dequeue_work_ilocked(work); binder_inner_proc_unlock(proc); } static struct binder_work *binder_dequeue_work_head_ilocked( struct list_head *list) { struct binder_work *w; w = list_first_entry_or_null(list, struct binder_work, entry); if (w) list_del_init(&w->entry); return w; } static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); static void binder_free_thread(struct binder_thread *thread); static void binder_free_proc(struct binder_proc *proc); static void binder_inc_node_tmpref_ilocked(struct binder_node *node); static bool binder_has_work_ilocked(struct binder_thread *thread, bool do_proc_work) { return thread->process_todo || thread->looper_need_return || (do_proc_work && !binder_worklist_empty_ilocked(&thread->proc->todo)); } static bool binder_has_work(struct binder_thread *thread, bool do_proc_work) { bool has_work; binder_inner_proc_lock(thread->proc); has_work = binder_has_work_ilocked(thread, do_proc_work); binder_inner_proc_unlock(thread->proc); return has_work; } static bool binder_available_for_proc_work_ilocked(struct binder_thread *thread) { return !thread->transaction_stack && binder_worklist_empty_ilocked(&thread->todo); } static void binder_wakeup_poll_threads_ilocked(struct binder_proc *proc, bool sync) { struct rb_node *n; struct binder_thread *thread; for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) { thread = rb_entry(n, struct binder_thread, rb_node); if (thread->looper & BINDER_LOOPER_STATE_POLL && binder_available_for_proc_work_ilocked(thread)) { if (sync) wake_up_interruptible_sync(&thread->wait); else wake_up_interruptible(&thread->wait); } } } /** * binder_select_thread_ilocked() - selects a thread for doing proc work. * @proc: process to select a thread from * * Note that calling this function moves the thread off the waiting_threads * list, so it can only be woken up by the caller of this function, or a * signal. Therefore, callers *should* always wake up the thread this function * returns. * * Return: If there's a thread currently waiting for process work, * returns that thread. Otherwise returns NULL. 
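 *
 * Editor's sketch of the select-then-wake pattern (hypothetical caller;
 * binder_wakeup_proc_ilocked() below is the in-tree user), run under
 * proc->inner_lock:
 *
 *   thread = binder_select_thread_ilocked(proc);
 *   binder_wakeup_thread_ilocked(proc, thread, false);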
*/ static struct binder_thread * binder_select_thread_ilocked(struct binder_proc *proc) { struct binder_thread *thread; assert_spin_locked(&proc->inner_lock); thread = list_first_entry_or_null(&proc->waiting_threads, struct binder_thread, waiting_thread_node); if (thread) list_del_init(&thread->waiting_thread_node); return thread; } /** * binder_wakeup_thread_ilocked() - wakes up a thread for doing proc work. * @proc: process to wake up a thread in * @thread: specific thread to wake-up (may be NULL) * @sync: whether to do a synchronous wake-up * * This function wakes up a thread in the @proc process. * The caller may provide a specific thread to wake-up in * the @thread parameter. If @thread is NULL, this function * will wake up threads that have called poll(). * * Note that for this function to work as expected, callers * should first call binder_select_thread() to find a thread * to handle the work (if they don't have a thread already), * and pass the result into the @thread parameter. */ static void binder_wakeup_thread_ilocked(struct binder_proc *proc, struct binder_thread *thread, bool sync) { assert_spin_locked(&proc->inner_lock); if (thread) { if (sync) wake_up_interruptible_sync(&thread->wait); else wake_up_interruptible(&thread->wait); return; } /* Didn't find a thread waiting for proc work; this can happen * in two scenarios: * 1. All threads are busy handling transactions * In that case, one of those threads should call back into * the kernel driver soon and pick up this work. * 2. Threads are using the (e)poll interface, in which case * they may be blocked on the waitqueue without having been * added to waiting_threads. For this case, we just iterate * over all threads not handling transaction work, and * wake them all up. We wake all because we don't know whether * a thread that called into (e)poll is handling non-binder * work currently. 
*/ binder_wakeup_poll_threads_ilocked(proc, sync); } static void binder_wakeup_proc_ilocked(struct binder_proc *proc) { struct binder_thread *thread = binder_select_thread_ilocked(proc); binder_wakeup_thread_ilocked(proc, thread, /* sync = */false); } static void binder_set_nice(long nice) { long min_nice; if (can_nice(current, nice)) { set_user_nice(current, nice); return; } min_nice = rlimit_to_nice(rlimit(RLIMIT_NICE)); binder_debug(BINDER_DEBUG_PRIORITY_CAP, "%d: nice value %ld not allowed use %ld instead\n", current->pid, nice, min_nice); set_user_nice(current, min_nice); if (min_nice <= MAX_NICE) return; binder_user_error("%d RLIMIT_NICE not set\n", current->pid); } static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc, binder_uintptr_t ptr) { struct rb_node *n = proc->nodes.rb_node; struct binder_node *node; assert_spin_locked(&proc->inner_lock); while (n) { node = rb_entry(n, struct binder_node, rb_node); if (ptr < node->ptr) n = n->rb_left; else if (ptr > node->ptr) n = n->rb_right; else { /* * take an implicit weak reference * to ensure node stays alive until * call to binder_put_node() */ binder_inc_node_tmpref_ilocked(node); return node; } } return NULL; } static struct binder_node *binder_get_node(struct binder_proc *proc, binder_uintptr_t ptr) { struct binder_node *node; binder_inner_proc_lock(proc); node = binder_get_node_ilocked(proc, ptr); binder_inner_proc_unlock(proc); return node; } static struct binder_node *binder_init_node_ilocked( struct binder_proc *proc, struct binder_node *new_node, struct flat_binder_object *fp) { struct rb_node **p = &proc->nodes.rb_node; struct rb_node *parent = NULL; struct binder_node *node; binder_uintptr_t ptr = fp ? fp->binder : 0; binder_uintptr_t cookie = fp ? fp->cookie : 0; __u32 flags = fp ? fp->flags : 0; assert_spin_locked(&proc->inner_lock); while (*p) { parent = *p; node = rb_entry(parent, struct binder_node, rb_node); if (ptr < node->ptr) p = &(*p)->rb_left; else if (ptr > node->ptr) p = &(*p)->rb_right; else { /* * A matching node is already in * the rb tree. Abandon the init * and return it. 
*/ binder_inc_node_tmpref_ilocked(node); return node; } } node = new_node; binder_stats_created(BINDER_STAT_NODE); node->tmp_refs++; rb_link_node(&node->rb_node, parent, p); rb_insert_color(&node->rb_node, &proc->nodes); node->debug_id = atomic_inc_return(&binder_last_id); node->proc = proc; node->ptr = ptr; node->cookie = cookie; node->work.type = BINDER_WORK_NODE; node->min_priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK; node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); node->txn_security_ctx = !!(flags & FLAT_BINDER_FLAG_TXN_SECURITY_CTX); spin_lock_init(&node->lock); INIT_LIST_HEAD(&node->work.entry); INIT_LIST_HEAD(&node->async_todo); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d:%d node %d u%016llx c%016llx created\n", proc->pid, current->pid, node->debug_id, (u64)node->ptr, (u64)node->cookie); return node; } static struct binder_node *binder_new_node(struct binder_proc *proc, struct flat_binder_object *fp) { struct binder_node *node; struct binder_node *new_node = kzalloc(sizeof(*node), GFP_KERNEL); if (!new_node) return NULL; binder_inner_proc_lock(proc); node = binder_init_node_ilocked(proc, new_node, fp); binder_inner_proc_unlock(proc); if (node != new_node) /* * The node was already added by another thread */ kfree(new_node); return node; } static void binder_free_node(struct binder_node *node) { kfree(node); binder_stats_deleted(BINDER_STAT_NODE); } static int binder_inc_node_nilocked(struct binder_node *node, int strong, int internal, struct list_head *target_list) { struct binder_proc *proc = node->proc; assert_spin_locked(&node->lock); if (proc) assert_spin_locked(&proc->inner_lock); if (strong) { if (internal) { if (target_list == NULL && node->internal_strong_refs == 0 && !(node->proc && node == node->proc->context->binder_context_mgr_node && node->has_strong_ref)) { pr_err("invalid inc strong node for %d\n", node->debug_id); return -EINVAL; } node->internal_strong_refs++; } else node->local_strong_refs++; if (!node->has_strong_ref && target_list) { struct binder_thread *thread = container_of(target_list, struct binder_thread, todo); binder_dequeue_work_ilocked(&node->work); BUG_ON(&thread->todo != target_list); binder_enqueue_deferred_thread_work_ilocked(thread, &node->work); } } else { if (!internal) node->local_weak_refs++; if (!node->has_weak_ref && list_empty(&node->work.entry)) { if (target_list == NULL) { pr_err("invalid inc weak node for %d\n", node->debug_id); return -EINVAL; } /* * See comment above */ binder_enqueue_work_ilocked(&node->work, target_list); } } return 0; } static int binder_inc_node(struct binder_node *node, int strong, int internal, struct list_head *target_list) { int ret; binder_node_inner_lock(node); ret = binder_inc_node_nilocked(node, strong, internal, target_list); binder_node_inner_unlock(node); return ret; } static bool binder_dec_node_nilocked(struct binder_node *node, int strong, int internal) { struct binder_proc *proc = node->proc; assert_spin_locked(&node->lock); if (proc) assert_spin_locked(&proc->inner_lock); if (strong) { if (internal) node->internal_strong_refs--; else node->local_strong_refs--; if (node->local_strong_refs || node->internal_strong_refs) return false; } else { if (!internal) node->local_weak_refs--; if (node->local_weak_refs || node->tmp_refs || !hlist_empty(&node->refs)) return false; } if (proc && (node->has_strong_ref || node->has_weak_ref)) { if (list_empty(&node->work.entry)) { binder_enqueue_work_ilocked(&node->work, &proc->todo); binder_wakeup_proc_ilocked(proc); } } else { if 
(hlist_empty(&node->refs) && !node->local_strong_refs && !node->local_weak_refs && !node->tmp_refs) { if (proc) { binder_dequeue_work_ilocked(&node->work); rb_erase(&node->rb_node, &proc->nodes); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "refless node %d deleted\n", node->debug_id); } else { BUG_ON(!list_empty(&node->work.entry)); spin_lock(&binder_dead_nodes_lock); /* * tmp_refs could have changed so * check it again */ if (node->tmp_refs) { spin_unlock(&binder_dead_nodes_lock); return false; } hlist_del(&node->dead_node); spin_unlock(&binder_dead_nodes_lock); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "dead node %d deleted\n", node->debug_id); } return true; } } return false; } static void binder_dec_node(struct binder_node *node, int strong, int internal) { bool free_node; binder_node_inner_lock(node); free_node = binder_dec_node_nilocked(node, strong, internal); binder_node_inner_unlock(node); if (free_node) binder_free_node(node); } static void binder_inc_node_tmpref_ilocked(struct binder_node *node) { /* * No call to binder_inc_node() is needed since we * don't need to inform userspace of any changes to * tmp_refs */ node->tmp_refs++; } /** * binder_inc_node_tmpref() - take a temporary reference on node * @node: node to reference * * Take reference on node to prevent the node from being freed * while referenced only by a local variable. The inner lock is * needed to serialize with the node work on the queue (which * isn't needed after the node is dead). If the node is dead * (node->proc is NULL), use binder_dead_nodes_lock to protect * node->tmp_refs against dead-node-only cases where the node * lock cannot be acquired (eg traversing the dead node list to * print nodes) */ static void binder_inc_node_tmpref(struct binder_node *node) { binder_node_lock(node); if (node->proc) binder_inner_proc_lock(node->proc); else spin_lock(&binder_dead_nodes_lock); binder_inc_node_tmpref_ilocked(node); if (node->proc) binder_inner_proc_unlock(node->proc); else spin_unlock(&binder_dead_nodes_lock); binder_node_unlock(node); } /** * binder_dec_node_tmpref() - remove a temporary reference on node * @node: node to reference * * Release temporary reference on node taken via binder_inc_node_tmpref() */ static void binder_dec_node_tmpref(struct binder_node *node) { bool free_node; binder_node_inner_lock(node); if (!node->proc) spin_lock(&binder_dead_nodes_lock); else __acquire(&binder_dead_nodes_lock); node->tmp_refs--; BUG_ON(node->tmp_refs < 0); if (!node->proc) spin_unlock(&binder_dead_nodes_lock); else __release(&binder_dead_nodes_lock); /* * Call binder_dec_node() to check if all refcounts are 0 * and cleanup is needed. Calling with strong=0 and internal=1 * causes no actual reference to be released in binder_dec_node(). * If that changes, a change is needed here too. 
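 *
 * Editor's note, illustrative only: the usual tmp_ref lifetime that ends up
 * here is, in hypothetical caller code,
 *
 *   node = binder_get_node(proc, ptr);   // takes a tmp_ref
 *   ...
 *   binder_put_node(node);               // drops it via this function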
*/ free_node = binder_dec_node_nilocked(node, 0, 1); binder_node_inner_unlock(node); if (free_node) binder_free_node(node); } static void binder_put_node(struct binder_node *node) { binder_dec_node_tmpref(node); } static struct binder_ref *binder_get_ref_olocked(struct binder_proc *proc, u32 desc, bool need_strong_ref) { struct rb_node *n = proc->refs_by_desc.rb_node; struct binder_ref *ref; while (n) { ref = rb_entry(n, struct binder_ref, rb_node_desc); if (desc < ref->data.desc) { n = n->rb_left; } else if (desc > ref->data.desc) { n = n->rb_right; } else if (need_strong_ref && !ref->data.strong) { binder_user_error("tried to use weak ref as strong ref\n"); return NULL; } else { return ref; } } return NULL; } /* Find the smallest unused descriptor the "slow way" */ static u32 slow_desc_lookup_olocked(struct binder_proc *proc, u32 offset) { struct binder_ref *ref; struct rb_node *n; u32 desc; desc = offset; for (n = rb_first(&proc->refs_by_desc); n; n = rb_next(n)) { ref = rb_entry(n, struct binder_ref, rb_node_desc); if (ref->data.desc > desc) break; desc = ref->data.desc + 1; } return desc; } /* * Find an available reference descriptor ID. The proc->outer_lock might * be released in the process, in which case -EAGAIN is returned and the * @desc should be considered invalid. */ static int get_ref_desc_olocked(struct binder_proc *proc, struct binder_node *node, u32 *desc) { struct dbitmap *dmap = &proc->dmap; unsigned int nbits, offset; unsigned long *new, bit; /* 0 is reserved for the context manager */ offset = (node == proc->context->binder_context_mgr_node) ? 0 : 1; if (!dbitmap_enabled(dmap)) { *desc = slow_desc_lookup_olocked(proc, offset); return 0; } if (dbitmap_acquire_next_zero_bit(dmap, offset, &bit) == 0) { *desc = bit; return 0; } /* * The dbitmap is full and needs to grow. The proc->outer_lock * is briefly released to allocate the new bitmap safely. */ nbits = dbitmap_grow_nbits(dmap); binder_proc_unlock(proc); new = bitmap_zalloc(nbits, GFP_KERNEL); binder_proc_lock(proc); dbitmap_grow(dmap, new, nbits); return -EAGAIN; } /** * binder_get_ref_for_node_olocked() - get the ref associated with given node * @proc: binder_proc that owns the ref * @node: binder_node of target * @new_ref: newly allocated binder_ref to be initialized or %NULL * * Look up the ref for the given node and return it if it exists * * If it doesn't exist and the caller provides a newly allocated * ref, initialize the fields of the newly allocated ref and insert * into the given proc rb_trees and node refs list. * * Return: the ref for node. It is possible that another thread * allocated/initialized the ref first in which case the * returned ref would be different than the passed-in * new_ref. new_ref must be kfree'd by the caller in * this case. 
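 *
 * Editor's sketch of the expected calling pattern (hypothetical code; the
 * in-tree caller, binder_inc_ref_for_node(), also drops proc->outer_lock
 * around the allocation):
 *
 *   ref = binder_get_ref_for_node_olocked(proc, node, NULL);
 *   if (!ref) {
 *           new_ref = kzalloc(sizeof(*new_ref), GFP_KERNEL);
 *           ref = binder_get_ref_for_node_olocked(proc, node, new_ref);
 *   }
 *   if (new_ref && ref != new_ref)
 *           kfree(new_ref);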
*/ static struct binder_ref *binder_get_ref_for_node_olocked( struct binder_proc *proc, struct binder_node *node, struct binder_ref *new_ref) { struct binder_ref *ref; struct rb_node *parent; struct rb_node **p; u32 desc; retry: p = &proc->refs_by_node.rb_node; parent = NULL; while (*p) { parent = *p; ref = rb_entry(parent, struct binder_ref, rb_node_node); if (node < ref->node) p = &(*p)->rb_left; else if (node > ref->node) p = &(*p)->rb_right; else return ref; } if (!new_ref) return NULL; /* might release the proc->outer_lock */ if (get_ref_desc_olocked(proc, node, &desc) == -EAGAIN) goto retry; binder_stats_created(BINDER_STAT_REF); new_ref->data.debug_id = atomic_inc_return(&binder_last_id); new_ref->proc = proc; new_ref->node = node; rb_link_node(&new_ref->rb_node_node, parent, p); rb_insert_color(&new_ref->rb_node_node, &proc->refs_by_node); new_ref->data.desc = desc; p = &proc->refs_by_desc.rb_node; while (*p) { parent = *p; ref = rb_entry(parent, struct binder_ref, rb_node_desc); if (new_ref->data.desc < ref->data.desc) p = &(*p)->rb_left; else if (new_ref->data.desc > ref->data.desc) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new_ref->rb_node_desc, parent, p); rb_insert_color(&new_ref->rb_node_desc, &proc->refs_by_desc); binder_node_lock(node); hlist_add_head(&new_ref->node_entry, &node->refs); binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d new ref %d desc %d for node %d\n", proc->pid, new_ref->data.debug_id, new_ref->data.desc, node->debug_id); binder_node_unlock(node); return new_ref; } static void binder_cleanup_ref_olocked(struct binder_ref *ref) { struct dbitmap *dmap = &ref->proc->dmap; bool delete_node = false; binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d delete ref %d desc %d for node %d\n", ref->proc->pid, ref->data.debug_id, ref->data.desc, ref->node->debug_id); if (dbitmap_enabled(dmap)) dbitmap_clear_bit(dmap, ref->data.desc); rb_erase(&ref->rb_node_desc, &ref->proc->refs_by_desc); rb_erase(&ref->rb_node_node, &ref->proc->refs_by_node); binder_node_inner_lock(ref->node); if (ref->data.strong) binder_dec_node_nilocked(ref->node, 1, 1); hlist_del(&ref->node_entry); delete_node = binder_dec_node_nilocked(ref->node, 0, 1); binder_node_inner_unlock(ref->node); /* * Clear ref->node unless we want the caller to free the node */ if (!delete_node) { /* * The caller uses ref->node to determine * whether the node needs to be freed. Clear * it since the node is still alive. */ ref->node = NULL; } if (ref->death) { binder_debug(BINDER_DEBUG_DEAD_BINDER, "%d delete ref %d desc %d has death notification\n", ref->proc->pid, ref->data.debug_id, ref->data.desc); binder_dequeue_work(ref->proc, &ref->death->work); binder_stats_deleted(BINDER_STAT_DEATH); } if (ref->freeze) { binder_dequeue_work(ref->proc, &ref->freeze->work); binder_stats_deleted(BINDER_STAT_FREEZE); } binder_stats_deleted(BINDER_STAT_REF); } /** * binder_inc_ref_olocked() - increment the ref for given handle * @ref: ref to be incremented * @strong: if true, strong increment, else weak * @target_list: list to queue node work on * * Increment the ref. 
@ref->proc->outer_lock must be held on entry * * Return: 0, if successful, else errno */ static int binder_inc_ref_olocked(struct binder_ref *ref, int strong, struct list_head *target_list) { int ret; if (strong) { if (ref->data.strong == 0) { ret = binder_inc_node(ref->node, 1, 1, target_list); if (ret) return ret; } ref->data.strong++; } else { if (ref->data.weak == 0) { ret = binder_inc_node(ref->node, 0, 1, target_list); if (ret) return ret; } ref->data.weak++; } return 0; } /** * binder_dec_ref_olocked() - dec the ref for given handle * @ref: ref to be decremented * @strong: if true, strong decrement, else weak * * Decrement the ref. * * Return: %true if ref is cleaned up and ready to be freed. */ static bool binder_dec_ref_olocked(struct binder_ref *ref, int strong) { if (strong) { if (ref->data.strong == 0) { binder_user_error("%d invalid dec strong, ref %d desc %d s %d w %d\n", ref->proc->pid, ref->data.debug_id, ref->data.desc, ref->data.strong, ref->data.weak); return false; } ref->data.strong--; if (ref->data.strong == 0) binder_dec_node(ref->node, strong, 1); } else { if (ref->data.weak == 0) { binder_user_error("%d invalid dec weak, ref %d desc %d s %d w %d\n", ref->proc->pid, ref->data.debug_id, ref->data.desc, ref->data.strong, ref->data.weak); return false; } ref->data.weak--; } if (ref->data.strong == 0 && ref->data.weak == 0) { binder_cleanup_ref_olocked(ref); return true; } return false; } /** * binder_get_node_from_ref() - get the node from the given proc/desc * @proc: proc containing the ref * @desc: the handle associated with the ref * @need_strong_ref: if true, only return node if ref is strong * @rdata: the id/refcount data for the ref * * Given a proc and ref handle, return the associated binder_node * * Return: a binder_node or NULL if not found or not strong when strong required */ static struct binder_node *binder_get_node_from_ref( struct binder_proc *proc, u32 desc, bool need_strong_ref, struct binder_ref_data *rdata) { struct binder_node *node; struct binder_ref *ref; binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, desc, need_strong_ref); if (!ref) goto err_no_ref; node = ref->node; /* * Take an implicit reference on the node to ensure * it stays alive until the call to binder_put_node() */ binder_inc_node_tmpref(node); if (rdata) *rdata = ref->data; binder_proc_unlock(proc); return node; err_no_ref: binder_proc_unlock(proc); return NULL; } /** * binder_free_ref() - free the binder_ref * @ref: ref to free * * Free the binder_ref. Free the binder_node indicated by ref->node * (if non-NULL) and the binder_ref_death indicated by ref->death. */ static void binder_free_ref(struct binder_ref *ref) { if (ref->node) binder_free_node(ref->node); kfree(ref->death); kfree(ref->freeze); kfree(ref); } /* shrink descriptor bitmap if needed */ static void try_shrink_dmap(struct binder_proc *proc) { unsigned long *new; int nbits; binder_proc_lock(proc); nbits = dbitmap_shrink_nbits(&proc->dmap); binder_proc_unlock(proc); if (!nbits) return; new = bitmap_zalloc(nbits, GFP_KERNEL); binder_proc_lock(proc); dbitmap_shrink(&proc->dmap, new, nbits); binder_proc_unlock(proc); } /** * binder_update_ref_for_handle() - inc/dec the ref for given handle * @proc: proc containing the ref * @desc: the handle associated with the ref * @increment: true=inc reference, false=dec reference * @strong: true=strong reference, false=weak reference * @rdata: the id/refcount data for the ref * * Given a proc and ref handle, increment or decrement the ref * according to "increment" arg. 
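 *
 * Editor's sketch of a typical call (hypothetical values; this drops one
 * strong reference on handle @desc and reports the final ref state):
 *
 *   struct binder_ref_data rdata;
 *   int ret;
 *
 *   ret = binder_update_ref_for_handle(proc, desc, false, true, &rdata);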
* * Return: 0 if successful, else errno */ static int binder_update_ref_for_handle(struct binder_proc *proc, uint32_t desc, bool increment, bool strong, struct binder_ref_data *rdata) { int ret = 0; struct binder_ref *ref; bool delete_ref = false; binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, desc, strong); if (!ref) { ret = -EINVAL; goto err_no_ref; } if (increment) ret = binder_inc_ref_olocked(ref, strong, NULL); else delete_ref = binder_dec_ref_olocked(ref, strong); if (rdata) *rdata = ref->data; binder_proc_unlock(proc); if (delete_ref) { binder_free_ref(ref); try_shrink_dmap(proc); } return ret; err_no_ref: binder_proc_unlock(proc); return ret; } /** * binder_dec_ref_for_handle() - dec the ref for given handle * @proc: proc containing the ref * @desc: the handle associated with the ref * @strong: true=strong reference, false=weak reference * @rdata: the id/refcount data for the ref * * Just calls binder_update_ref_for_handle() to decrement the ref. * * Return: 0 if successful, else errno */ static int binder_dec_ref_for_handle(struct binder_proc *proc, uint32_t desc, bool strong, struct binder_ref_data *rdata) { return binder_update_ref_for_handle(proc, desc, false, strong, rdata); } /** * binder_inc_ref_for_node() - increment the ref for given proc/node * @proc: proc containing the ref * @node: target node * @strong: true=strong reference, false=weak reference * @target_list: worklist to use if node is incremented * @rdata: the id/refcount data for the ref * * Given a proc and node, increment the ref. Create the ref if it * doesn't already exist * * Return: 0 if successful, else errno */ static int binder_inc_ref_for_node(struct binder_proc *proc, struct binder_node *node, bool strong, struct list_head *target_list, struct binder_ref_data *rdata) { struct binder_ref *ref; struct binder_ref *new_ref = NULL; int ret = 0; binder_proc_lock(proc); ref = binder_get_ref_for_node_olocked(proc, node, NULL); if (!ref) { binder_proc_unlock(proc); new_ref = kzalloc(sizeof(*ref), GFP_KERNEL); if (!new_ref) return -ENOMEM; binder_proc_lock(proc); ref = binder_get_ref_for_node_olocked(proc, node, new_ref); } ret = binder_inc_ref_olocked(ref, strong, target_list); *rdata = ref->data; if (ret && ref == new_ref) { /* * Cleanup the failed reference here as the target * could now be dead and have already released its * references by now. Calling on the new reference * with strong=0 and a tmp_refs will not decrement * the node. The new_ref gets kfree'd below. */ binder_cleanup_ref_olocked(new_ref); ref = NULL; } binder_proc_unlock(proc); if (new_ref && ref != new_ref) /* * Another thread created the ref first so * free the one we allocated */ kfree(new_ref); return ret; } static void binder_pop_transaction_ilocked(struct binder_thread *target_thread, struct binder_transaction *t) { BUG_ON(!target_thread); assert_spin_locked(&target_thread->proc->inner_lock); BUG_ON(target_thread->transaction_stack != t); BUG_ON(target_thread->transaction_stack->from != target_thread); target_thread->transaction_stack = target_thread->transaction_stack->from_parent; t->from = NULL; } /** * binder_thread_dec_tmpref() - decrement thread->tmp_ref * @thread: thread to decrement * * A thread needs to be kept alive while being used to create or * handle a transaction. binder_get_txn_from() is used to safely * extract t->from from a binder_transaction and keep the thread * indicated by t->from from being freed. 
When done with that * binder_thread, this function is called to decrement the * tmp_ref and free if appropriate (thread has been released * and no transaction being processed by the driver) */ static void binder_thread_dec_tmpref(struct binder_thread *thread) { /* * atomic is used to protect the counter value while * it cannot reach zero or thread->is_dead is false */ binder_inner_proc_lock(thread->proc); atomic_dec(&thread->tmp_ref); if (thread->is_dead && !atomic_read(&thread->tmp_ref)) { binder_inner_proc_unlock(thread->proc); binder_free_thread(thread); return; } binder_inner_proc_unlock(thread->proc); } /** * binder_proc_dec_tmpref() - decrement proc->tmp_ref * @proc: proc to decrement * * A binder_proc needs to be kept alive while being used to create or * handle a transaction. proc->tmp_ref is incremented when * creating a new transaction or the binder_proc is currently in-use * by threads that are being released. When done with the binder_proc, * this function is called to decrement the counter and free the * proc if appropriate (proc has been released, all threads have * been released and not currently in-use to process a transaction). */ static void binder_proc_dec_tmpref(struct binder_proc *proc) { binder_inner_proc_lock(proc); proc->tmp_ref--; if (proc->is_dead && RB_EMPTY_ROOT(&proc->threads) && !proc->tmp_ref) { binder_inner_proc_unlock(proc); binder_free_proc(proc); return; } binder_inner_proc_unlock(proc); } /** * binder_get_txn_from() - safely extract the "from" thread in transaction * @t: binder transaction for t->from * * Atomically return the "from" thread and increment the tmp_ref * count for the thread to ensure it stays alive until * binder_thread_dec_tmpref() is called. * * Return: the value of t->from */ static struct binder_thread *binder_get_txn_from( struct binder_transaction *t) { struct binder_thread *from; spin_lock(&t->lock); from = t->from; if (from) atomic_inc(&from->tmp_ref); spin_unlock(&t->lock); return from; } /** * binder_get_txn_from_and_acq_inner() - get t->from and acquire inner lock * @t: binder transaction for t->from * * Same as binder_get_txn_from() except it also acquires the proc->inner_lock * to guarantee that the thread cannot be released while operating on it. * The caller must call binder_inner_proc_unlock() to release the inner lock * as well as call binder_dec_thread_txn() to release the reference. * * Return: the value of t->from */ static struct binder_thread *binder_get_txn_from_and_acq_inner( struct binder_transaction *t) __acquires(&t->from->proc->inner_lock) { struct binder_thread *from; from = binder_get_txn_from(t); if (!from) { __acquire(&from->proc->inner_lock); return NULL; } binder_inner_proc_lock(from->proc); if (t->from) { BUG_ON(from != t->from); return from; } binder_inner_proc_unlock(from->proc); __acquire(&from->proc->inner_lock); binder_thread_dec_tmpref(from); return NULL; } /** * binder_free_txn_fixups() - free unprocessed fd fixups * @t: binder transaction for t->from * * If the transaction is being torn down prior to being * processed by the target process, free all of the * fd fixups and fput the file structs. It is safe to * call this function after the fixups have been * processed -- in that case, the list will be empty. 
*/ static void binder_free_txn_fixups(struct binder_transaction *t) { struct binder_txn_fd_fixup *fixup, *tmp; list_for_each_entry_safe(fixup, tmp, &t->fd_fixups, fixup_entry) { fput(fixup->file); if (fixup->target_fd >= 0) put_unused_fd(fixup->target_fd); list_del(&fixup->fixup_entry); kfree(fixup); } } static void binder_txn_latency_free(struct binder_transaction *t) { int from_proc, from_thread, to_proc, to_thread; spin_lock(&t->lock); from_proc = t->from ? t->from->proc->pid : 0; from_thread = t->from ? t->from->pid : 0; to_proc = t->to_proc ? t->to_proc->pid : 0; to_thread = t->to_thread ? t->to_thread->pid : 0; spin_unlock(&t->lock); trace_binder_txn_latency_free(t, from_proc, from_thread, to_proc, to_thread); } static void binder_free_transaction(struct binder_transaction *t) { struct binder_proc *target_proc = t->to_proc; if (target_proc) { binder_inner_proc_lock(target_proc); target_proc->outstanding_txns--; if (target_proc->outstanding_txns < 0) pr_warn("%s: Unexpected outstanding_txns %d\n", __func__, target_proc->outstanding_txns); if (!target_proc->outstanding_txns && target_proc->is_frozen) wake_up_interruptible_all(&target_proc->freeze_wait); if (t->buffer) t->buffer->transaction = NULL; binder_inner_proc_unlock(target_proc); } if (trace_binder_txn_latency_free_enabled()) binder_txn_latency_free(t); /* * If the transaction has no target_proc, then * t->buffer->transaction has already been cleared. */ binder_free_txn_fixups(t); kfree(t); binder_stats_deleted(BINDER_STAT_TRANSACTION); } static void binder_send_failed_reply(struct binder_transaction *t, uint32_t error_code) { struct binder_thread *target_thread; struct binder_transaction *next; BUG_ON(t->flags & TF_ONE_WAY); while (1) { target_thread = binder_get_txn_from_and_acq_inner(t); if (target_thread) { binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "send failed reply for transaction %d to %d:%d\n", t->debug_id, target_thread->proc->pid, target_thread->pid); binder_pop_transaction_ilocked(target_thread, t); if (target_thread->reply_error.cmd == BR_OK) { target_thread->reply_error.cmd = error_code; binder_enqueue_thread_work_ilocked( target_thread, &target_thread->reply_error.work); wake_up_interruptible(&target_thread->wait); } else { /* * Cannot get here for normal operation, but * we can if multiple synchronous transactions * are sent without blocking for responses. * Just ignore the 2nd error in this case. 
*/ pr_warn("Unexpected reply error: %u\n", target_thread->reply_error.cmd); } binder_inner_proc_unlock(target_thread->proc); binder_thread_dec_tmpref(target_thread); binder_free_transaction(t); return; } __release(&target_thread->proc->inner_lock); next = t->from_parent; binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "send failed reply for transaction %d, target dead\n", t->debug_id); binder_free_transaction(t); if (next == NULL) { binder_debug(BINDER_DEBUG_DEAD_BINDER, "reply failed, no target thread at root\n"); return; } t = next; binder_debug(BINDER_DEBUG_DEAD_BINDER, "reply failed, no target thread -- retry %d\n", t->debug_id); } } /** * binder_cleanup_transaction() - cleans up undelivered transaction * @t: transaction that needs to be cleaned up * @reason: reason the transaction wasn't delivered * @error_code: error to return to caller (if synchronous call) */ static void binder_cleanup_transaction(struct binder_transaction *t, const char *reason, uint32_t error_code) { if (t->buffer->target_node && !(t->flags & TF_ONE_WAY)) { binder_send_failed_reply(t, error_code); } else { binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, "undelivered transaction %d, %s\n", t->debug_id, reason); binder_free_transaction(t); } } /** * binder_get_object() - gets object and checks for valid metadata * @proc: binder_proc owning the buffer * @u: sender's user pointer to base of buffer * @buffer: binder_buffer that we're parsing. * @offset: offset in the @buffer at which to validate an object. * @object: struct binder_object to read into * * Copy the binder object at the given offset into @object. If @u is * provided then the copy is from the sender's buffer. If not, then * it is copied from the target's @buffer. * * Return: If there's a valid metadata object at @offset, the * size of that object. Otherwise, it returns zero. The object * is read into the struct binder_object pointed to by @object. */ static size_t binder_get_object(struct binder_proc *proc, const void __user *u, struct binder_buffer *buffer, unsigned long offset, struct binder_object *object) { size_t read_size; struct binder_object_header *hdr; size_t object_size = 0; read_size = min_t(size_t, sizeof(*object), buffer->data_size - offset); if (offset > buffer->data_size || read_size < sizeof(*hdr) || !IS_ALIGNED(offset, sizeof(u32))) return 0; if (u) { if (copy_from_user(object, u + offset, read_size)) return 0; } else { if (binder_alloc_copy_from_buffer(&proc->alloc, object, buffer, offset, read_size)) return 0; } /* Ok, now see if we read a complete object. */ hdr = &object->hdr; switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: object_size = sizeof(struct flat_binder_object); break; case BINDER_TYPE_FD: object_size = sizeof(struct binder_fd_object); break; case BINDER_TYPE_PTR: object_size = sizeof(struct binder_buffer_object); break; case BINDER_TYPE_FDA: object_size = sizeof(struct binder_fd_array_object); break; default: return 0; } if (offset <= buffer->data_size - object_size && buffer->data_size >= object_size) return object_size; else return 0; } /** * binder_validate_ptr() - validates binder_buffer_object in a binder_buffer. 
* @proc: binder_proc owning the buffer * @b: binder_buffer containing the object * @object: struct binder_object to read into * @index: index in offset array at which the binder_buffer_object is * located * @start_offset: points to the start of the offset array * @object_offsetp: offset of @object read from @b * @num_valid: the number of valid offsets in the offset array * * Return: If @index is within the valid range of the offset array * described by @start and @num_valid, and if there's a valid * binder_buffer_object at the offset found in index @index * of the offset array, that object is returned. Otherwise, * %NULL is returned. * Note that the offset found in index @index itself is not * verified; this function assumes that @num_valid elements * from @start were previously verified to have valid offsets. * If @object_offsetp is non-NULL, then the offset within * @b is written to it. */ static struct binder_buffer_object *binder_validate_ptr( struct binder_proc *proc, struct binder_buffer *b, struct binder_object *object, binder_size_t index, binder_size_t start_offset, binder_size_t *object_offsetp, binder_size_t num_valid) { size_t object_size; binder_size_t object_offset; unsigned long buffer_offset; if (index >= num_valid) return NULL; buffer_offset = start_offset + sizeof(binder_size_t) * index; if (binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, b, buffer_offset, sizeof(object_offset))) return NULL; object_size = binder_get_object(proc, NULL, b, object_offset, object); if (!object_size || object->hdr.type != BINDER_TYPE_PTR) return NULL; if (object_offsetp) *object_offsetp = object_offset; return &object->bbo; } /** * binder_validate_fixup() - validates pointer/fd fixups happen in order. * @proc: binder_proc owning the buffer * @b: transaction buffer * @objects_start_offset: offset to start of objects buffer * @buffer_obj_offset: offset to binder_buffer_object in which to fix up * @fixup_offset: start offset in @buffer to fix up * @last_obj_offset: offset to last binder_buffer_object that we fixed * @last_min_offset: minimum fixup offset in object at @last_obj_offset * * Return: %true if a fixup in buffer @buffer at offset @offset is * allowed. * * For safety reasons, we only allow fixups inside a buffer to happen * at increasing offsets; additionally, we only allow fixup on the last * buffer object that was verified, or one of its parents. 
* * Example of what is allowed: * * A * B (parent = A, offset = 0) * C (parent = A, offset = 16) * D (parent = C, offset = 0) * E (parent = A, offset = 32) // min_offset is 16 (C.parent_offset) * * Examples of what is not allowed: * * Decreasing offsets within the same parent: * A * C (parent = A, offset = 16) * B (parent = A, offset = 0) // decreasing offset within A * * Referring to a parent that wasn't the last object or any of its parents: * A * B (parent = A, offset = 0) * C (parent = A, offset = 0) * C (parent = A, offset = 16) * D (parent = B, offset = 0) // B is not A or any of A's parents */ static bool binder_validate_fixup(struct binder_proc *proc, struct binder_buffer *b, binder_size_t objects_start_offset, binder_size_t buffer_obj_offset, binder_size_t fixup_offset, binder_size_t last_obj_offset, binder_size_t last_min_offset) { if (!last_obj_offset) { /* Nothing to fix up in */ return false; } while (last_obj_offset != buffer_obj_offset) { unsigned long buffer_offset; struct binder_object last_object; struct binder_buffer_object *last_bbo; size_t object_size = binder_get_object(proc, NULL, b, last_obj_offset, &last_object); if (object_size != sizeof(*last_bbo)) return false; last_bbo = &last_object.bbo; /* * Safe to retrieve the parent of last_obj, since it * was already previously verified by the driver. */ if ((last_bbo->flags & BINDER_BUFFER_FLAG_HAS_PARENT) == 0) return false; last_min_offset = last_bbo->parent_offset + sizeof(uintptr_t); buffer_offset = objects_start_offset + sizeof(binder_size_t) * last_bbo->parent; if (binder_alloc_copy_from_buffer(&proc->alloc, &last_obj_offset, b, buffer_offset, sizeof(last_obj_offset))) return false; } return (fixup_offset >= last_min_offset); } /** * struct binder_task_work_cb - for deferred close * * @twork: callback_head for task work * @file: file to close * * Structure to pass task work to be handled after * returning from binder_ioctl() via task_work_add(). */ struct binder_task_work_cb { struct callback_head twork; struct file *file; }; /** * binder_do_fd_close() - close list of file descriptors * @twork: callback head for task work * * It is not safe to call ksys_close() during the binder_ioctl() * function if there is a chance that binder's own file descriptor * might be closed. This is to meet the requirements for using * fdget() (see comments for __fget_light()). Therefore use * task_work_add() to schedule the close operation once we have * returned from binder_ioctl(). This function is a callback * for that mechanism and does the actual ksys_close() on the * given file descriptor. */ static void binder_do_fd_close(struct callback_head *twork) { struct binder_task_work_cb *twcb = container_of(twork, struct binder_task_work_cb, twork); fput(twcb->file); kfree(twcb); } /** * binder_deferred_fd_close() - schedule a close for the given file-descriptor * @fd: file-descriptor to close * * See comments in binder_do_fd_close(). This function is used to schedule * a file-descriptor to be closed after returning from binder_ioctl(). 
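 *
 * Editor's note: in this file the call is paired with a looper flag so the
 * thread returns to userspace promptly; roughly (illustrative):
 *
 *   binder_deferred_fd_close(fd);
 *   if (thread)
 *           thread->looper_need_return = true;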
*/ static void binder_deferred_fd_close(int fd) { struct binder_task_work_cb *twcb; twcb = kzalloc(sizeof(*twcb), GFP_KERNEL); if (!twcb) return; init_task_work(&twcb->twork, binder_do_fd_close); twcb->file = file_close_fd(fd); if (twcb->file) { // pin it until binder_do_fd_close(); see comments there get_file(twcb->file); filp_close(twcb->file, current->files); task_work_add(current, &twcb->twork, TWA_RESUME); } else { kfree(twcb); } } static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_thread *thread, struct binder_buffer *buffer, binder_size_t off_end_offset, bool is_failure) { int debug_id = buffer->debug_id; binder_size_t off_start_offset, buffer_offset; binder_debug(BINDER_DEBUG_TRANSACTION, "%d buffer release %d, size %zd-%zd, failed at %llx\n", proc->pid, buffer->debug_id, buffer->data_size, buffer->offsets_size, (unsigned long long)off_end_offset); if (buffer->target_node) binder_dec_node(buffer->target_node, 1, 0); off_start_offset = ALIGN(buffer->data_size, sizeof(void *)); for (buffer_offset = off_start_offset; buffer_offset < off_end_offset; buffer_offset += sizeof(binder_size_t)) { struct binder_object_header *hdr; size_t object_size = 0; struct binder_object object; binder_size_t object_offset; if (!binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, buffer, buffer_offset, sizeof(object_offset))) object_size = binder_get_object(proc, NULL, buffer, object_offset, &object); if (object_size == 0) { pr_err("transaction release %d bad object at offset %lld, size %zd\n", debug_id, (u64)object_offset, buffer->data_size); continue; } hdr = &object.hdr; switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { struct flat_binder_object *fp; struct binder_node *node; fp = to_flat_binder_object(hdr); node = binder_get_node(proc, fp->binder); if (node == NULL) { pr_err("transaction release %d bad node %016llx\n", debug_id, (u64)fp->binder); break; } binder_debug(BINDER_DEBUG_TRANSACTION, " node %d u%016llx\n", node->debug_id, (u64)node->ptr); binder_dec_node(node, hdr->type == BINDER_TYPE_BINDER, 0); binder_put_node(node); } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { struct flat_binder_object *fp; struct binder_ref_data rdata; int ret; fp = to_flat_binder_object(hdr); ret = binder_dec_ref_for_handle(proc, fp->handle, hdr->type == BINDER_TYPE_HANDLE, &rdata); if (ret) { pr_err("transaction release %d bad handle %d, ret = %d\n", debug_id, fp->handle, ret); break; } binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d\n", rdata.debug_id, rdata.desc); } break; case BINDER_TYPE_FD: { /* * No need to close the file here since user-space * closes it for successfully delivered * transactions. For transactions that weren't * delivered, the new fd was never allocated so * there is no need to close and the fput on the * file is done when the transaction is torn * down. */ } break; case BINDER_TYPE_PTR: /* * Nothing to do here, this will get cleaned up when the * transaction buffer gets freed */ break; case BINDER_TYPE_FDA: { struct binder_fd_array_object *fda; struct binder_buffer_object *parent; struct binder_object ptr_object; binder_size_t fda_offset; size_t fd_index; binder_size_t fd_buf_size; binder_size_t num_valid; if (is_failure) { /* * The fd fixups have not been applied so no * fds need to be closed. 
*/ continue; } num_valid = (buffer_offset - off_start_offset) / sizeof(binder_size_t); fda = to_binder_fd_array_object(hdr); parent = binder_validate_ptr(proc, buffer, &ptr_object, fda->parent, off_start_offset, NULL, num_valid); if (!parent) { pr_err("transaction release %d bad parent offset\n", debug_id); continue; } fd_buf_size = sizeof(u32) * fda->num_fds; if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { pr_err("transaction release %d invalid number of fds (%lld)\n", debug_id, (u64)fda->num_fds); continue; } if (fd_buf_size > parent->length || fda->parent_offset > parent->length - fd_buf_size) { /* No space for all file descriptors here. */ pr_err("transaction release %d not enough space for %lld fds in buffer\n", debug_id, (u64)fda->num_fds); continue; } /* * the source data for binder_buffer_object is visible * to user-space and the @buffer element is the user * pointer to the buffer_object containing the fd_array. * Convert the address to an offset relative to * the base of the transaction buffer. */ fda_offset = parent->buffer - buffer->user_data + fda->parent_offset; for (fd_index = 0; fd_index < fda->num_fds; fd_index++) { u32 fd; int err; binder_size_t offset = fda_offset + fd_index * sizeof(fd); err = binder_alloc_copy_from_buffer( &proc->alloc, &fd, buffer, offset, sizeof(fd)); WARN_ON(err); if (!err) { binder_deferred_fd_close(fd); /* * Need to make sure the thread goes * back to userspace to complete the * deferred close */ if (thread) thread->looper_need_return = true; } } } break; default: pr_err("transaction release %d bad object type %x\n", debug_id, hdr->type); break; } } } /* Clean up all the objects in the buffer */ static inline void binder_release_entire_buffer(struct binder_proc *proc, struct binder_thread *thread, struct binder_buffer *buffer, bool is_failure) { binder_size_t off_end_offset; off_end_offset = ALIGN(buffer->data_size, sizeof(void *)); off_end_offset += buffer->offsets_size; binder_transaction_buffer_release(proc, thread, buffer, off_end_offset, is_failure); } static int binder_translate_binder(struct flat_binder_object *fp, struct binder_transaction *t, struct binder_thread *thread) { struct binder_node *node; struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; struct binder_ref_data rdata; int ret = 0; node = binder_get_node(proc, fp->binder); if (!node) { node = binder_new_node(proc, fp); if (!node) return -ENOMEM; } if (fp->cookie != node->cookie) { binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n", proc->pid, thread->pid, (u64)fp->binder, node->debug_id, (u64)fp->cookie, (u64)node->cookie); ret = -EINVAL; goto done; } if (security_binder_transfer_binder(proc->cred, target_proc->cred)) { ret = -EPERM; goto done; } ret = binder_inc_ref_for_node(target_proc, node, fp->hdr.type == BINDER_TYPE_BINDER, &thread->todo, &rdata); if (ret) goto done; if (fp->hdr.type == BINDER_TYPE_BINDER) fp->hdr.type = BINDER_TYPE_HANDLE; else fp->hdr.type = BINDER_TYPE_WEAK_HANDLE; fp->binder = 0; fp->handle = rdata.desc; fp->cookie = 0; trace_binder_transaction_node_to_ref(t, node, &rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " node %d u%016llx -> ref %d desc %d\n", node->debug_id, (u64)node->ptr, rdata.debug_id, rdata.desc); done: binder_put_node(node); return ret; } static int binder_translate_handle(struct flat_binder_object *fp, struct binder_transaction *t, struct binder_thread *thread) { struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; struct 
binder_node *node; struct binder_ref_data src_rdata; int ret = 0; node = binder_get_node_from_ref(proc, fp->handle, fp->hdr.type == BINDER_TYPE_HANDLE, &src_rdata); if (!node) { binder_user_error("%d:%d got transaction with invalid handle, %d\n", proc->pid, thread->pid, fp->handle); return -EINVAL; } if (security_binder_transfer_binder(proc->cred, target_proc->cred)) { ret = -EPERM; goto done; } binder_node_lock(node); if (node->proc == target_proc) { if (fp->hdr.type == BINDER_TYPE_HANDLE) fp->hdr.type = BINDER_TYPE_BINDER; else fp->hdr.type = BINDER_TYPE_WEAK_BINDER; fp->binder = node->ptr; fp->cookie = node->cookie; if (node->proc) binder_inner_proc_lock(node->proc); else __acquire(&node->proc->inner_lock); binder_inc_node_nilocked(node, fp->hdr.type == BINDER_TYPE_BINDER, 0, NULL); if (node->proc) binder_inner_proc_unlock(node->proc); else __release(&node->proc->inner_lock); trace_binder_transaction_ref_to_node(t, node, &src_rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> node %d u%016llx\n", src_rdata.debug_id, src_rdata.desc, node->debug_id, (u64)node->ptr); binder_node_unlock(node); } else { struct binder_ref_data dest_rdata; binder_node_unlock(node); ret = binder_inc_ref_for_node(target_proc, node, fp->hdr.type == BINDER_TYPE_HANDLE, NULL, &dest_rdata); if (ret) goto done; fp->binder = 0; fp->handle = dest_rdata.desc; fp->cookie = 0; trace_binder_transaction_ref_to_ref(t, node, &src_rdata, &dest_rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> ref %d desc %d (node %d)\n", src_rdata.debug_id, src_rdata.desc, dest_rdata.debug_id, dest_rdata.desc, node->debug_id); } done: binder_put_node(node); return ret; } static int binder_translate_fd(u32 fd, binder_size_t fd_offset, struct binder_transaction *t, struct binder_thread *thread, struct binder_transaction *in_reply_to) { struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; struct binder_txn_fd_fixup *fixup; struct file *file; int ret = 0; bool target_allows_fd; if (in_reply_to) target_allows_fd = !!(in_reply_to->flags & TF_ACCEPT_FDS); else target_allows_fd = t->buffer->target_node->accept_fds; if (!target_allows_fd) { binder_user_error("%d:%d got %s with fd, %d, but target does not allow fds\n", proc->pid, thread->pid, in_reply_to ? "reply" : "transaction", fd); ret = -EPERM; goto err_fd_not_accepted; } file = fget(fd); if (!file) { binder_user_error("%d:%d got transaction with invalid fd, %d\n", proc->pid, thread->pid, fd); ret = -EBADF; goto err_fget; } ret = security_binder_transfer_file(proc->cred, target_proc->cred, file); if (ret < 0) { ret = -EPERM; goto err_security; } /* * Add fixup record for this transaction. The allocation * of the fd in the target needs to be done from a * target thread. */ fixup = kzalloc(sizeof(*fixup), GFP_KERNEL); if (!fixup) { ret = -ENOMEM; goto err_alloc; } fixup->file = file; fixup->offset = fd_offset; fixup->target_fd = -1; trace_binder_transaction_fd_send(t, fd, fixup->offset); list_add_tail(&fixup->fixup_entry, &t->fd_fixups); return ret; err_alloc: err_security: fput(file); err_fget: err_fd_not_accepted: return ret; } /** * struct binder_ptr_fixup - data to be fixed-up in target buffer * @offset offset in target buffer to fixup * @skip_size bytes to skip in copy (fixup will be written later) * @fixup_data data to write at fixup offset * @node list node * * This is used for the pointer fixup list (pf) which is created and consumed * during binder_transaction() and is only accessed locally. No * locking is necessary. 
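 *
 * For illustration (a sketch, not literal initializer syntax): a
 * BINDER_TYPE_PTR parent patch is queued roughly as
 * { .offset = parent slot, .fixup_data = new child address, .skip_size = 0 },
 * whereas a BINDER_TYPE_FDA entry is queued with
 * .skip_size = num_fds * sizeof(u32) so the fd words are skipped during
 * the deferred copy and patched later in the target process (see
 * binder_translate_fd_array() and binder_fixup_parent()).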
* * The list is ordered by @offset. */ struct binder_ptr_fixup { binder_size_t offset; size_t skip_size; binder_uintptr_t fixup_data; struct list_head node; }; /** * struct binder_sg_copy - scatter-gather data to be copied * @offset offset in target buffer * @sender_uaddr user address in source buffer * @length bytes to copy * @node list node * * This is used for the sg copy list (sgc) which is created and consumed * during binder_transaction() and is only accessed locally. No * locking is necessary. * * The list is ordered by @offset. */ struct binder_sg_copy { binder_size_t offset; const void __user *sender_uaddr; size_t length; struct list_head node; }; /** * binder_do_deferred_txn_copies() - copy and fixup scatter-gather data * @alloc: binder_alloc associated with @buffer * @buffer: binder buffer in target process * @sgc_head: list_head of scatter-gather copy list * @pf_head: list_head of pointer fixup list * * Processes all elements of @sgc_head, applying fixups from @pf_head * and copying the scatter-gather data from the source process' user * buffer to the target's buffer. It is expected that the list creation * and processing all occurs during binder_transaction() so these lists * are only accessed in local context. * * Return: 0=success, else -errno */ static int binder_do_deferred_txn_copies(struct binder_alloc *alloc, struct binder_buffer *buffer, struct list_head *sgc_head, struct list_head *pf_head) { int ret = 0; struct binder_sg_copy *sgc, *tmpsgc; struct binder_ptr_fixup *tmppf; struct binder_ptr_fixup *pf = list_first_entry_or_null(pf_head, struct binder_ptr_fixup, node); list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) { size_t bytes_copied = 0; while (bytes_copied < sgc->length) { size_t copy_size; size_t bytes_left = sgc->length - bytes_copied; size_t offset = sgc->offset + bytes_copied; /* * We copy up to the fixup (pointed to by pf) */ copy_size = pf ? min(bytes_left, (size_t)pf->offset - offset) : bytes_left; if (!ret && copy_size) ret = binder_alloc_copy_user_to_buffer( alloc, buffer, offset, sgc->sender_uaddr + bytes_copied, copy_size); bytes_copied += copy_size; if (copy_size != bytes_left) { BUG_ON(!pf); /* we stopped at a fixup offset */ if (pf->skip_size) { /* * we are just skipping. This is for * BINDER_TYPE_FDA where the translated * fds will be fixed up when we get * to target context. */ bytes_copied += pf->skip_size; } else { /* apply the fixup indicated by pf */ if (!ret) ret = binder_alloc_copy_to_buffer( alloc, buffer, pf->offset, &pf->fixup_data, sizeof(pf->fixup_data)); bytes_copied += sizeof(pf->fixup_data); } list_del(&pf->node); kfree(pf); pf = list_first_entry_or_null(pf_head, struct binder_ptr_fixup, node); } } list_del(&sgc->node); kfree(sgc); } list_for_each_entry_safe(pf, tmppf, pf_head, node) { BUG_ON(pf->skip_size == 0); list_del(&pf->node); kfree(pf); } BUG_ON(!list_empty(sgc_head)); return ret > 0 ? -EINVAL : ret; } /** * binder_cleanup_deferred_txn_lists() - free specified lists * @sgc_head: list_head of scatter-gather copy list * @pf_head: list_head of pointer fixup list * * Called to clean up @sgc_head and @pf_head if there is an * error. 
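 *
 * On success both lists are consumed entry by entry inside
 * binder_do_deferred_txn_copies(), so this helper is only reached from
 * the binder_transaction() error paths (the err_copy_data_failed family
 * of labels).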
*/ static void binder_cleanup_deferred_txn_lists(struct list_head *sgc_head, struct list_head *pf_head) { struct binder_sg_copy *sgc, *tmpsgc; struct binder_ptr_fixup *pf, *tmppf; list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) { list_del(&sgc->node); kfree(sgc); } list_for_each_entry_safe(pf, tmppf, pf_head, node) { list_del(&pf->node); kfree(pf); } } /** * binder_defer_copy() - queue a scatter-gather buffer for copy * @sgc_head: list_head of scatter-gather copy list * @offset: binder buffer offset in target process * @sender_uaddr: user address in source process * @length: bytes to copy * * Specify a scatter-gather block to be copied. The actual copy must * be deferred until all the needed fixups are identified and queued. * Then the copy and fixups are done together so un-translated values * from the source are never visible in the target buffer. * * We are guaranteed that repeated calls to this function will have * monotonically increasing @offset values so the list will naturally * be ordered. * * Return: 0=success, else -errno */ static int binder_defer_copy(struct list_head *sgc_head, binder_size_t offset, const void __user *sender_uaddr, size_t length) { struct binder_sg_copy *bc = kzalloc(sizeof(*bc), GFP_KERNEL); if (!bc) return -ENOMEM; bc->offset = offset; bc->sender_uaddr = sender_uaddr; bc->length = length; INIT_LIST_HEAD(&bc->node); /* * We are guaranteed that the deferred copies are in-order * so just add to the tail. */ list_add_tail(&bc->node, sgc_head); return 0; } /** * binder_add_fixup() - queue a fixup to be applied to sg copy * @pf_head: list_head of binder ptr fixup list * @offset: binder buffer offset in target process * @fixup: bytes to be copied for fixup * @skip_size: bytes to skip when copying (fixup will be applied later) * * Add the specified fixup to a list ordered by @offset. When copying * the scatter-gather buffers, the fixup will be copied instead of * data from the source buffer. For BINDER_TYPE_FDA fixups, the fixup * will be applied later (in target process context), so we just skip * the bytes specified by @skip_size. If @skip_size is 0, we copy the * value in @fixup. * * This function is called *mostly* in @offset order, but there are * exceptions. Since out-of-order inserts are relatively uncommon, * we insert the new element by searching backward from the tail of * the list. * * Return: 0=success, else -errno */ static int binder_add_fixup(struct list_head *pf_head, binder_size_t offset, binder_uintptr_t fixup, size_t skip_size) { struct binder_ptr_fixup *pf = kzalloc(sizeof(*pf), GFP_KERNEL); struct binder_ptr_fixup *tmppf; if (!pf) return -ENOMEM; pf->offset = offset; pf->fixup_data = fixup; pf->skip_size = skip_size; INIT_LIST_HEAD(&pf->node); /* Fixups are *mostly* added in-order, but there are some * exceptions. Look backwards through list for insertion point. 
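	 * For example, with queued offsets [8, 24, 40], inserting a fixup
	 * at offset 32 walks backwards past 40 and is linked in after 24,
	 * keeping the list sorted by offset.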
*/ list_for_each_entry_reverse(tmppf, pf_head, node) { if (tmppf->offset < pf->offset) { list_add(&pf->node, &tmppf->node); return 0; } } /* * if we get here, then the new offset is the lowest so * insert at the head */ list_add(&pf->node, pf_head); return 0; } static int binder_translate_fd_array(struct list_head *pf_head, struct binder_fd_array_object *fda, const void __user *sender_ubuffer, struct binder_buffer_object *parent, struct binder_buffer_object *sender_uparent, struct binder_transaction *t, struct binder_thread *thread, struct binder_transaction *in_reply_to) { binder_size_t fdi, fd_buf_size; binder_size_t fda_offset; const void __user *sender_ufda_base; struct binder_proc *proc = thread->proc; int ret; if (fda->num_fds == 0) return 0; fd_buf_size = sizeof(u32) * fda->num_fds; if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { binder_user_error("%d:%d got transaction with invalid number of fds (%lld)\n", proc->pid, thread->pid, (u64)fda->num_fds); return -EINVAL; } if (fd_buf_size > parent->length || fda->parent_offset > parent->length - fd_buf_size) { /* No space for all file descriptors here. */ binder_user_error("%d:%d not enough space to store %lld fds in buffer\n", proc->pid, thread->pid, (u64)fda->num_fds); return -EINVAL; } /* * the source data for binder_buffer_object is visible * to user-space and the @buffer element is the user * pointer to the buffer_object containing the fd_array. * Convert the address to an offset relative to * the base of the transaction buffer. */ fda_offset = parent->buffer - t->buffer->user_data + fda->parent_offset; sender_ufda_base = (void __user *)(uintptr_t)sender_uparent->buffer + fda->parent_offset; if (!IS_ALIGNED((unsigned long)fda_offset, sizeof(u32)) || !IS_ALIGNED((unsigned long)sender_ufda_base, sizeof(u32))) { binder_user_error("%d:%d parent offset not aligned correctly.\n", proc->pid, thread->pid); return -EINVAL; } ret = binder_add_fixup(pf_head, fda_offset, 0, fda->num_fds * sizeof(u32)); if (ret) return ret; for (fdi = 0; fdi < fda->num_fds; fdi++) { u32 fd; binder_size_t offset = fda_offset + fdi * sizeof(fd); binder_size_t sender_uoffset = fdi * sizeof(fd); ret = copy_from_user(&fd, sender_ufda_base + sender_uoffset, sizeof(fd)); if (!ret) ret = binder_translate_fd(fd, offset, t, thread, in_reply_to); if (ret) return ret > 0 ? 
-EINVAL : ret; } return 0; } static int binder_fixup_parent(struct list_head *pf_head, struct binder_transaction *t, struct binder_thread *thread, struct binder_buffer_object *bp, binder_size_t off_start_offset, binder_size_t num_valid, binder_size_t last_fixup_obj_off, binder_size_t last_fixup_min_off) { struct binder_buffer_object *parent; struct binder_buffer *b = t->buffer; struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; struct binder_object object; binder_size_t buffer_offset; binder_size_t parent_offset; if (!(bp->flags & BINDER_BUFFER_FLAG_HAS_PARENT)) return 0; parent = binder_validate_ptr(target_proc, b, &object, bp->parent, off_start_offset, &parent_offset, num_valid); if (!parent) { binder_user_error("%d:%d got transaction with invalid parent offset or type\n", proc->pid, thread->pid); return -EINVAL; } if (!binder_validate_fixup(target_proc, b, off_start_offset, parent_offset, bp->parent_offset, last_fixup_obj_off, last_fixup_min_off)) { binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", proc->pid, thread->pid); return -EINVAL; } if (parent->length < sizeof(binder_uintptr_t) || bp->parent_offset > parent->length - sizeof(binder_uintptr_t)) { /* No space for a pointer here! */ binder_user_error("%d:%d got transaction with invalid parent offset\n", proc->pid, thread->pid); return -EINVAL; } buffer_offset = bp->parent_offset + parent->buffer - b->user_data; return binder_add_fixup(pf_head, buffer_offset, bp->buffer, 0); } /** * binder_can_update_transaction() - Can a txn be superseded by an updated one? * @t1: the pending async txn in the frozen process * @t2: the new async txn to supersede the outdated pending one * * Return: true if t2 can supersede t1 * false if t2 can not supersede t1 */ static bool binder_can_update_transaction(struct binder_transaction *t1, struct binder_transaction *t2) { if ((t1->flags & t2->flags & (TF_ONE_WAY | TF_UPDATE_TXN)) != (TF_ONE_WAY | TF_UPDATE_TXN) || !t1->to_proc || !t2->to_proc) return false; if (t1->to_proc->tsk == t2->to_proc->tsk && t1->code == t2->code && t1->flags == t2->flags && t1->buffer->pid == t2->buffer->pid && t1->buffer->target_node->ptr == t2->buffer->target_node->ptr && t1->buffer->target_node->cookie == t2->buffer->target_node->cookie) return true; return false; } /** * binder_find_outdated_transaction_ilocked() - Find the outdated transaction * @t: new async transaction * @target_list: list to find outdated transaction * * Return: the outdated transaction if found * NULL if no outdated transacton can be found * * Requires the proc->inner_lock to be held. */ static struct binder_transaction * binder_find_outdated_transaction_ilocked(struct binder_transaction *t, struct list_head *target_list) { struct binder_work *w; list_for_each_entry(w, target_list, entry) { struct binder_transaction *t_queued; if (w->type != BINDER_WORK_TRANSACTION) continue; t_queued = container_of(w, struct binder_transaction, work); if (binder_can_update_transaction(t_queued, t)) return t_queued; } return NULL; } /** * binder_proc_transaction() - sends a transaction to a process and wakes it up * @t: transaction to send * @proc: process to send the transaction to * @thread: thread in @proc to send the transaction to (may be NULL) * * This function queues a transaction to the specified process. It will try * to find a thread in the target process to handle the transaction and * wake it up. If no thread is found, the work is queued to the proc * waitqueue. 
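 *
 * For oneway transactions that arrive while the target node already has
 * an async transaction in flight, the work is parked on the node's
 * async_todo list instead and is only moved to the proc todo list when
 * the previous async buffer is freed (see binder_free_buf()).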
* * If the @thread parameter is not NULL, the transaction is always queued * to the waitlist of that specific thread. * * Return: 0 if the transaction was successfully queued * BR_DEAD_REPLY if the target process or thread is dead * BR_FROZEN_REPLY if the target process or thread is frozen and * the sync transaction was rejected * BR_TRANSACTION_PENDING_FROZEN if the target process is frozen * and the async transaction was successfully queued */ static int binder_proc_transaction(struct binder_transaction *t, struct binder_proc *proc, struct binder_thread *thread) { struct binder_node *node = t->buffer->target_node; bool oneway = !!(t->flags & TF_ONE_WAY); bool pending_async = false; struct binder_transaction *t_outdated = NULL; bool frozen = false; BUG_ON(!node); binder_node_lock(node); if (oneway) { BUG_ON(thread); if (node->has_async_transaction) pending_async = true; else node->has_async_transaction = true; } binder_inner_proc_lock(proc); if (proc->is_frozen) { frozen = true; proc->sync_recv |= !oneway; proc->async_recv |= oneway; } if ((frozen && !oneway) || proc->is_dead || (thread && thread->is_dead)) { binder_inner_proc_unlock(proc); binder_node_unlock(node); return frozen ? BR_FROZEN_REPLY : BR_DEAD_REPLY; } if (!thread && !pending_async) thread = binder_select_thread_ilocked(proc); if (thread) { binder_enqueue_thread_work_ilocked(thread, &t->work); } else if (!pending_async) { binder_enqueue_work_ilocked(&t->work, &proc->todo); } else { if ((t->flags & TF_UPDATE_TXN) && frozen) { t_outdated = binder_find_outdated_transaction_ilocked(t, &node->async_todo); if (t_outdated) { binder_debug(BINDER_DEBUG_TRANSACTION, "txn %d supersedes %d\n", t->debug_id, t_outdated->debug_id); list_del_init(&t_outdated->work.entry); proc->outstanding_txns--; } } binder_enqueue_work_ilocked(&t->work, &node->async_todo); } if (!pending_async) binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */); proc->outstanding_txns++; binder_inner_proc_unlock(proc); binder_node_unlock(node); /* * To reduce potential contention, free the outdated transaction and * buffer after releasing the locks. */ if (t_outdated) { struct binder_buffer *buffer = t_outdated->buffer; t_outdated->buffer = NULL; buffer->transaction = NULL; trace_binder_transaction_update_buffer_release(buffer); binder_release_entire_buffer(proc, NULL, buffer, false); binder_alloc_free_buf(&proc->alloc, buffer); kfree(t_outdated); binder_stats_deleted(BINDER_STAT_TRANSACTION); } if (oneway && frozen) return BR_TRANSACTION_PENDING_FROZEN; return 0; } /** * binder_get_node_refs_for_txn() - Get required refs on node for txn * @node: struct binder_node for which to get refs * @procp: returns @node->proc if valid * @error: if no @procp then returns BR_DEAD_REPLY * * User-space normally keeps the node alive when creating a transaction * since it has a reference to the target. The local strong ref keeps it * alive if the sending process dies before the target process processes * the transaction. If the source process is malicious or has a reference * counting bug, relying on the local strong ref can fail. * * Since user-space can cause the local strong ref to go away, we also take * a tmpref on the node to ensure it survives while we are constructing * the transaction. We also need a tmpref on the proc while we are * constructing the transaction, so we take that here as well. * * Return: The target_node with refs taken or NULL if no @node->proc is NULL. * Also sets @procp if valid. 
If the @node->proc is NULL indicating that the * target proc has died, @error is set to BR_DEAD_REPLY. */ static struct binder_node *binder_get_node_refs_for_txn( struct binder_node *node, struct binder_proc **procp, uint32_t *error) { struct binder_node *target_node = NULL; binder_node_inner_lock(node); if (node->proc) { target_node = node; binder_inc_node_nilocked(node, 1, 0, NULL); binder_inc_node_tmpref_ilocked(node); node->proc->tmp_ref++; *procp = node->proc; } else *error = BR_DEAD_REPLY; binder_node_inner_unlock(node); return target_node; } static void binder_set_txn_from_error(struct binder_transaction *t, int id, uint32_t command, int32_t param) { struct binder_thread *from = binder_get_txn_from_and_acq_inner(t); if (!from) { /* annotation for sparse */ __release(&from->proc->inner_lock); return; } /* don't override existing errors */ if (from->ee.command == BR_OK) binder_set_extended_error(&from->ee, id, command, param); binder_inner_proc_unlock(from->proc); binder_thread_dec_tmpref(from); } static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, binder_size_t extra_buffers_size) { int ret; struct binder_transaction *t; struct binder_work *w; struct binder_work *tcomplete; binder_size_t buffer_offset = 0; binder_size_t off_start_offset, off_end_offset; binder_size_t off_min; binder_size_t sg_buf_offset, sg_buf_end_offset; binder_size_t user_offset = 0; struct binder_proc *target_proc = NULL; struct binder_thread *target_thread = NULL; struct binder_node *target_node = NULL; struct binder_transaction *in_reply_to = NULL; struct binder_transaction_log_entry *e; uint32_t return_error = 0; uint32_t return_error_param = 0; uint32_t return_error_line = 0; binder_size_t last_fixup_obj_off = 0; binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; int t_debug_id = atomic_inc_return(&binder_last_id); ktime_t t_start_time = ktime_get(); struct lsm_context lsmctx = { }; struct list_head sgc_head; struct list_head pf_head; const void __user *user_buffer = (const void __user *) (uintptr_t)tr->data.ptr.buffer; INIT_LIST_HEAD(&sgc_head); INIT_LIST_HEAD(&pf_head); e = binder_transaction_log_add(&binder_transaction_log); e->debug_id = t_debug_id; e->call_type = reply ? 2 : !!(tr->flags & TF_ONE_WAY); e->from_proc = proc->pid; e->from_thread = thread->pid; e->target_handle = tr->target.handle; e->data_size = tr->data_size; e->offsets_size = tr->offsets_size; strscpy(e->context_name, proc->context->name, BINDERFS_MAX_NAME); binder_inner_proc_lock(proc); binder_set_extended_error(&thread->ee, t_debug_id, BR_OK, 0); binder_inner_proc_unlock(proc); if (reply) { binder_inner_proc_lock(proc); in_reply_to = thread->transaction_stack; if (in_reply_to == NULL) { binder_inner_proc_unlock(proc); binder_user_error("%d:%d got reply transaction with no transaction stack\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; goto err_empty_call_stack; } if (in_reply_to->to_thread != thread) { spin_lock(&in_reply_to->lock); binder_user_error("%d:%d got reply transaction with bad transaction stack, transaction %d has target %d:%d\n", proc->pid, thread->pid, in_reply_to->debug_id, in_reply_to->to_proc ? in_reply_to->to_proc->pid : 0, in_reply_to->to_thread ? 
in_reply_to->to_thread->pid : 0); spin_unlock(&in_reply_to->lock); binder_inner_proc_unlock(proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; in_reply_to = NULL; goto err_bad_call_stack; } thread->transaction_stack = in_reply_to->to_parent; binder_inner_proc_unlock(proc); binder_set_nice(in_reply_to->saved_priority); target_thread = binder_get_txn_from_and_acq_inner(in_reply_to); if (target_thread == NULL) { /* annotation for sparse */ __release(&target_thread->proc->inner_lock); binder_txn_error("%d:%d reply target not found\n", thread->pid, proc->pid); return_error = BR_DEAD_REPLY; return_error_line = __LINE__; goto err_dead_binder; } if (target_thread->transaction_stack != in_reply_to) { binder_user_error("%d:%d got reply transaction with bad target transaction stack %d, expected %d\n", proc->pid, thread->pid, target_thread->transaction_stack ? target_thread->transaction_stack->debug_id : 0, in_reply_to->debug_id); binder_inner_proc_unlock(target_thread->proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; in_reply_to = NULL; target_thread = NULL; goto err_dead_binder; } target_proc = target_thread->proc; target_proc->tmp_ref++; binder_inner_proc_unlock(target_thread->proc); } else { if (tr->target.handle) { struct binder_ref *ref; /* * There must already be a strong ref * on this node. If so, do a strong * increment on the node to ensure it * stays alive until the transaction is * done. */ binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, tr->target.handle, true); if (ref) { target_node = binder_get_node_refs_for_txn( ref->node, &target_proc, &return_error); } else { binder_user_error("%d:%d got transaction to invalid handle, %u\n", proc->pid, thread->pid, tr->target.handle); return_error = BR_FAILED_REPLY; } binder_proc_unlock(proc); } else { mutex_lock(&context->context_mgr_node_lock); target_node = context->binder_context_mgr_node; if (target_node) target_node = binder_get_node_refs_for_txn( target_node, &target_proc, &return_error); else return_error = BR_DEAD_REPLY; mutex_unlock(&context->context_mgr_node_lock); if (target_node && target_proc->pid == proc->pid) { binder_user_error("%d:%d got transaction to context manager from process owning it\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_invalid_target_handle; } } if (!target_node) { binder_txn_error("%d:%d cannot find target node\n", thread->pid, proc->pid); /* * return_error is set above */ return_error_param = -EINVAL; return_error_line = __LINE__; goto err_dead_binder; } e->to_node = target_node->debug_id; if (WARN_ON(proc == target_proc)) { binder_txn_error("%d:%d self transactions not allowed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_invalid_target_handle; } if (security_binder_transaction(proc->cred, target_proc->cred) < 0) { binder_txn_error("%d:%d transaction credentials failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -EPERM; return_error_line = __LINE__; goto err_invalid_target_handle; } binder_inner_proc_lock(proc); w = list_first_entry_or_null(&thread->todo, struct binder_work, entry); if (!(tr->flags & TF_ONE_WAY) && w && w->type == BINDER_WORK_TRANSACTION) { /* * Do not allow new outgoing transaction from a * thread that has a transaction at the head of * its todo list. 
Only need to check the head * because binder_select_thread_ilocked picks a * thread from proc->waiting_threads to enqueue * the transaction, and nothing is queued to the * todo list while the thread is on waiting_threads. */ binder_user_error("%d:%d new transaction not allowed when there is a transaction on thread todo\n", proc->pid, thread->pid); binder_inner_proc_unlock(proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; goto err_bad_todo_list; } if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) { struct binder_transaction *tmp; tmp = thread->transaction_stack; if (tmp->to_thread != thread) { spin_lock(&tmp->lock); binder_user_error("%d:%d got new transaction with bad transaction stack, transaction %d has target %d:%d\n", proc->pid, thread->pid, tmp->debug_id, tmp->to_proc ? tmp->to_proc->pid : 0, tmp->to_thread ? tmp->to_thread->pid : 0); spin_unlock(&tmp->lock); binder_inner_proc_unlock(proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; goto err_bad_call_stack; } while (tmp) { struct binder_thread *from; spin_lock(&tmp->lock); from = tmp->from; if (from && from->proc == target_proc) { atomic_inc(&from->tmp_ref); target_thread = from; spin_unlock(&tmp->lock); break; } spin_unlock(&tmp->lock); tmp = tmp->from_parent; } } binder_inner_proc_unlock(proc); } if (target_thread) e->to_thread = target_thread->pid; e->to_proc = target_proc->pid; /* TODO: reuse incoming transaction for reply */ t = kzalloc(sizeof(*t), GFP_KERNEL); if (t == NULL) { binder_txn_error("%d:%d cannot allocate transaction\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -ENOMEM; return_error_line = __LINE__; goto err_alloc_t_failed; } INIT_LIST_HEAD(&t->fd_fixups); binder_stats_created(BINDER_STAT_TRANSACTION); spin_lock_init(&t->lock); tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL); if (tcomplete == NULL) { binder_txn_error("%d:%d cannot allocate work for transaction\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -ENOMEM; return_error_line = __LINE__; goto err_alloc_tcomplete_failed; } binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE); t->debug_id = t_debug_id; t->start_time = t_start_time; if (reply) binder_debug(BINDER_DEBUG_TRANSACTION, "%d:%d BC_REPLY %d -> %d:%d, data %016llx-%016llx size %lld-%lld-%lld\n", proc->pid, thread->pid, t->debug_id, target_proc->pid, target_thread->pid, (u64)tr->data.ptr.buffer, (u64)tr->data.ptr.offsets, (u64)tr->data_size, (u64)tr->offsets_size, (u64)extra_buffers_size); else binder_debug(BINDER_DEBUG_TRANSACTION, "%d:%d BC_TRANSACTION %d -> %d - node %d, data %016llx-%016llx size %lld-%lld-%lld\n", proc->pid, thread->pid, t->debug_id, target_proc->pid, target_node->debug_id, (u64)tr->data.ptr.buffer, (u64)tr->data.ptr.offsets, (u64)tr->data_size, (u64)tr->offsets_size, (u64)extra_buffers_size); if (!reply && !(tr->flags & TF_ONE_WAY)) t->from = thread; else t->from = NULL; t->from_pid = proc->pid; t->from_tid = thread->pid; t->sender_euid = task_euid(proc->tsk); t->to_proc = target_proc; t->to_thread = target_thread; t->code = tr->code; t->flags = tr->flags; t->priority = task_nice(current); if (target_node && target_node->txn_security_ctx) { u32 secid; size_t added_size; security_cred_getsecid(proc->cred, &secid); ret = security_secid_to_secctx(secid, &lsmctx); if (ret < 0) { binder_txn_error("%d:%d failed to get security context\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; 
return_error_param = ret; return_error_line = __LINE__; goto err_get_secctx_failed; } added_size = ALIGN(lsmctx.len, sizeof(u64)); extra_buffers_size += added_size; if (extra_buffers_size < added_size) { binder_txn_error("%d:%d integer overflow of extra_buffers_size\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_extra_size; } } trace_binder_transaction(reply, t, target_node); t->buffer = binder_alloc_new_buf(&target_proc->alloc, tr->data_size, tr->offsets_size, extra_buffers_size, !reply && (t->flags & TF_ONE_WAY)); if (IS_ERR(t->buffer)) { char *s; ret = PTR_ERR(t->buffer); s = (ret == -ESRCH) ? ": vma cleared, target dead or dying" : (ret == -ENOSPC) ? ": no space left" : (ret == -ENOMEM) ? ": memory allocation failed" : ""; binder_txn_error("cannot allocate buffer%s", s); return_error_param = PTR_ERR(t->buffer); return_error = return_error_param == -ESRCH ? BR_DEAD_REPLY : BR_FAILED_REPLY; return_error_line = __LINE__; t->buffer = NULL; goto err_binder_alloc_buf_failed; } if (lsmctx.context) { int err; size_t buf_offset = ALIGN(tr->data_size, sizeof(void *)) + ALIGN(tr->offsets_size, sizeof(void *)) + ALIGN(extra_buffers_size, sizeof(void *)) - ALIGN(lsmctx.len, sizeof(u64)); t->security_ctx = t->buffer->user_data + buf_offset; err = binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, buf_offset, lsmctx.context, lsmctx.len); if (err) { t->security_ctx = 0; WARN_ON(1); } security_release_secctx(&lsmctx); lsmctx.context = NULL; } t->buffer->debug_id = t->debug_id; t->buffer->transaction = t; t->buffer->target_node = target_node; t->buffer->clear_on_free = !!(t->flags & TF_CLEAR_BUF); trace_binder_transaction_alloc_buf(t->buffer); if (binder_alloc_copy_user_to_buffer( &target_proc->alloc, t->buffer, ALIGN(tr->data_size, sizeof(void *)), (const void __user *) (uintptr_t)tr->data.ptr.offsets, tr->offsets_size)) { binder_user_error("%d:%d got transaction with invalid offsets ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EFAULT; return_error_line = __LINE__; goto err_copy_data_failed; } if (!IS_ALIGNED(tr->offsets_size, sizeof(binder_size_t))) { binder_user_error("%d:%d got transaction with invalid offsets size, %lld\n", proc->pid, thread->pid, (u64)tr->offsets_size); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } if (!IS_ALIGNED(extra_buffers_size, sizeof(u64))) { binder_user_error("%d:%d got transaction with unaligned buffers size, %lld\n", proc->pid, thread->pid, (u64)extra_buffers_size); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } off_start_offset = ALIGN(tr->data_size, sizeof(void *)); buffer_offset = off_start_offset; off_end_offset = off_start_offset + tr->offsets_size; sg_buf_offset = ALIGN(off_end_offset, sizeof(void *)); sg_buf_end_offset = sg_buf_offset + extra_buffers_size - ALIGN(lsmctx.len, sizeof(u64)); off_min = 0; for (buffer_offset = off_start_offset; buffer_offset < off_end_offset; buffer_offset += sizeof(binder_size_t)) { struct binder_object_header *hdr; size_t object_size; struct binder_object object; binder_size_t object_offset; binder_size_t copy_size; if (binder_alloc_copy_from_buffer(&target_proc->alloc, &object_offset, t->buffer, buffer_offset, sizeof(object_offset))) { binder_txn_error("%d:%d copy offset from buffer failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; 
return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } /* * Copy the source user buffer up to the next object * that will be processed. */ copy_size = object_offset - user_offset; if (copy_size && (user_offset > object_offset || object_offset > tr->data_size || binder_alloc_copy_user_to_buffer( &target_proc->alloc, t->buffer, user_offset, user_buffer + user_offset, copy_size))) { binder_user_error("%d:%d got transaction with invalid data ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EFAULT; return_error_line = __LINE__; goto err_copy_data_failed; } object_size = binder_get_object(target_proc, user_buffer, t->buffer, object_offset, &object); if (object_size == 0 || object_offset < off_min) { binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n", proc->pid, thread->pid, (u64)object_offset, (u64)off_min, (u64)t->buffer->data_size); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } /* * Set offset to the next buffer fragment to be * copied */ user_offset = object_offset + object_size; hdr = &object.hdr; off_min = object_offset + object_size; switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { struct flat_binder_object *fp; fp = to_flat_binder_object(hdr); ret = binder_translate_binder(fp, t, thread); if (ret < 0 || binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, fp, sizeof(*fp))) { binder_txn_error("%d:%d translate binder failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { struct flat_binder_object *fp; fp = to_flat_binder_object(hdr); ret = binder_translate_handle(fp, t, thread); if (ret < 0 || binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, fp, sizeof(*fp))) { binder_txn_error("%d:%d translate handle failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } } break; case BINDER_TYPE_FD: { struct binder_fd_object *fp = to_binder_fd_object(hdr); binder_size_t fd_offset = object_offset + (uintptr_t)&fp->fd - (uintptr_t)fp; int ret = binder_translate_fd(fp->fd, fd_offset, t, thread, in_reply_to); fp->pad_binder = 0; if (ret < 0 || binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, fp, sizeof(*fp))) { binder_txn_error("%d:%d translate fd failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } } break; case BINDER_TYPE_FDA: { struct binder_object ptr_object; binder_size_t parent_offset; struct binder_object user_object; size_t user_parent_size; struct binder_fd_array_object *fda = to_binder_fd_array_object(hdr); size_t num_valid = (buffer_offset - off_start_offset) / sizeof(binder_size_t); struct binder_buffer_object *parent = binder_validate_ptr(target_proc, t->buffer, &ptr_object, fda->parent, off_start_offset, &parent_offset, num_valid); if (!parent) { binder_user_error("%d:%d got transaction with invalid parent offset or type\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_parent; } if (!binder_validate_fixup(target_proc, t->buffer, off_start_offset, parent_offset, 
fda->parent_offset, last_fixup_obj_off, last_fixup_min_off)) { binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_parent; } /* * We need to read the user version of the parent * object to get the original user offset */ user_parent_size = binder_get_object(proc, user_buffer, t->buffer, parent_offset, &user_object); if (user_parent_size != sizeof(user_object.bbo)) { binder_user_error("%d:%d invalid ptr object size: %zd vs %zd\n", proc->pid, thread->pid, user_parent_size, sizeof(user_object.bbo)); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_parent; } ret = binder_translate_fd_array(&pf_head, fda, user_buffer, parent, &user_object.bbo, t, thread, in_reply_to); if (!ret) ret = binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, fda, sizeof(*fda)); if (ret) { binder_txn_error("%d:%d translate fd array failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret > 0 ? -EINVAL : ret; return_error_line = __LINE__; goto err_translate_failed; } last_fixup_obj_off = parent_offset; last_fixup_min_off = fda->parent_offset + sizeof(u32) * fda->num_fds; } break; case BINDER_TYPE_PTR: { struct binder_buffer_object *bp = to_binder_buffer_object(hdr); size_t buf_left = sg_buf_end_offset - sg_buf_offset; size_t num_valid; if (bp->length > buf_left) { binder_user_error("%d:%d got transaction with too large buffer\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } ret = binder_defer_copy(&sgc_head, sg_buf_offset, (const void __user *)(uintptr_t)bp->buffer, bp->length); if (ret) { binder_txn_error("%d:%d deferred copy failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } /* Fixup buffer pointer to target proc address space */ bp->buffer = t->buffer->user_data + sg_buf_offset; sg_buf_offset += ALIGN(bp->length, sizeof(u64)); num_valid = (buffer_offset - off_start_offset) / sizeof(binder_size_t); ret = binder_fixup_parent(&pf_head, t, thread, bp, off_start_offset, num_valid, last_fixup_obj_off, last_fixup_min_off); if (ret < 0 || binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, bp, sizeof(*bp))) { binder_txn_error("%d:%d failed to fixup parent\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } last_fixup_obj_off = object_offset; last_fixup_min_off = 0; } break; default: binder_user_error("%d:%d got transaction with invalid object type, %x\n", proc->pid, thread->pid, hdr->type); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_object_type; } } /* Done processing objects, copy the rest of the buffer */ if (binder_alloc_copy_user_to_buffer( &target_proc->alloc, t->buffer, user_offset, user_buffer + user_offset, tr->data_size - user_offset)) { binder_user_error("%d:%d got transaction with invalid data ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EFAULT; return_error_line = __LINE__; goto err_copy_data_failed; } ret = binder_do_deferred_txn_copies(&target_proc->alloc, t->buffer, &sgc_head, &pf_head); if (ret) { binder_user_error("%d:%d got 
transaction with invalid offsets ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_copy_data_failed; } if (t->buffer->oneway_spam_suspect) tcomplete->type = BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT; else tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; t->work.type = BINDER_WORK_TRANSACTION; if (reply) { binder_enqueue_thread_work(thread, tcomplete); binder_inner_proc_lock(target_proc); if (target_thread->is_dead) { return_error = BR_DEAD_REPLY; binder_inner_proc_unlock(target_proc); goto err_dead_proc_or_thread; } BUG_ON(t->buffer->async_transaction != 0); binder_pop_transaction_ilocked(target_thread, in_reply_to); binder_enqueue_thread_work_ilocked(target_thread, &t->work); target_proc->outstanding_txns++; binder_inner_proc_unlock(target_proc); wake_up_interruptible_sync(&target_thread->wait); binder_free_transaction(in_reply_to); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); binder_inner_proc_lock(proc); /* * Defer the TRANSACTION_COMPLETE, so we don't return to * userspace immediately; this allows the target process to * immediately start processing this transaction, reducing * latency. We will then return the TRANSACTION_COMPLETE when * the target replies (or there is an error). */ binder_enqueue_deferred_thread_work_ilocked(thread, tcomplete); t->need_reply = 1; t->from_parent = thread->transaction_stack; thread->transaction_stack = t; binder_inner_proc_unlock(proc); return_error = binder_proc_transaction(t, target_proc, target_thread); if (return_error) { binder_inner_proc_lock(proc); binder_pop_transaction_ilocked(thread, t); binder_inner_proc_unlock(proc); goto err_dead_proc_or_thread; } } else { BUG_ON(target_node == NULL); BUG_ON(t->buffer->async_transaction != 1); return_error = binder_proc_transaction(t, target_proc, NULL); /* * Let the caller know when async transaction reaches a frozen * process and is put in a pending queue, waiting for the target * process to be unfrozen. 
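	 * In that case the completion work queued below is delivered as
	 * BINDER_WORK_TRANSACTION_PENDING rather than a plain
	 * TRANSACTION_COMPLETE, and it is not treated as a fatal error below.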
*/ if (return_error == BR_TRANSACTION_PENDING_FROZEN) tcomplete->type = BINDER_WORK_TRANSACTION_PENDING; binder_enqueue_thread_work(thread, tcomplete); if (return_error && return_error != BR_TRANSACTION_PENDING_FROZEN) goto err_dead_proc_or_thread; } if (target_thread) binder_thread_dec_tmpref(target_thread); binder_proc_dec_tmpref(target_proc); if (target_node) binder_dec_node_tmpref(target_node); /* * write barrier to synchronize with initialization * of log entry */ smp_wmb(); WRITE_ONCE(e->debug_id_done, t_debug_id); return; err_dead_proc_or_thread: binder_txn_error("%d:%d dead process or thread\n", thread->pid, proc->pid); return_error_line = __LINE__; binder_dequeue_work(proc, tcomplete); err_translate_failed: err_bad_object_type: err_bad_offset: err_bad_parent: err_copy_data_failed: binder_cleanup_deferred_txn_lists(&sgc_head, &pf_head); binder_free_txn_fixups(t); trace_binder_transaction_failed_buffer_release(t->buffer); binder_transaction_buffer_release(target_proc, NULL, t->buffer, buffer_offset, true); if (target_node) binder_dec_node_tmpref(target_node); target_node = NULL; t->buffer->transaction = NULL; binder_alloc_free_buf(&target_proc->alloc, t->buffer); err_binder_alloc_buf_failed: err_bad_extra_size: if (lsmctx.context) security_release_secctx(&lsmctx); err_get_secctx_failed: kfree(tcomplete); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); err_alloc_tcomplete_failed: if (trace_binder_txn_latency_free_enabled()) binder_txn_latency_free(t); kfree(t); binder_stats_deleted(BINDER_STAT_TRANSACTION); err_alloc_t_failed: err_bad_todo_list: err_bad_call_stack: err_empty_call_stack: err_dead_binder: err_invalid_target_handle: if (target_node) { binder_dec_node(target_node, 1, 0); binder_dec_node_tmpref(target_node); } binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d transaction %s to %d:%d failed %d/%d/%d, code %u size %lld-%lld line %d\n", proc->pid, thread->pid, reply ? "reply" : (tr->flags & TF_ONE_WAY ? "async" : "call"), target_proc ? target_proc->pid : 0, target_thread ? 
target_thread->pid : 0, t_debug_id, return_error, return_error_param, tr->code, (u64)tr->data_size, (u64)tr->offsets_size, return_error_line); if (target_thread) binder_thread_dec_tmpref(target_thread); if (target_proc) binder_proc_dec_tmpref(target_proc); { struct binder_transaction_log_entry *fe; e->return_error = return_error; e->return_error_param = return_error_param; e->return_error_line = return_error_line; fe = binder_transaction_log_add(&binder_transaction_log_failed); *fe = *e; /* * write barrier to synchronize with initialization * of log entry */ smp_wmb(); WRITE_ONCE(e->debug_id_done, t_debug_id); WRITE_ONCE(fe->debug_id_done, t_debug_id); } BUG_ON(thread->return_error.cmd != BR_OK); if (in_reply_to) { binder_set_txn_from_error(in_reply_to, t_debug_id, return_error, return_error_param); thread->return_error.cmd = BR_TRANSACTION_COMPLETE; binder_enqueue_thread_work(thread, &thread->return_error.work); binder_send_failed_reply(in_reply_to, return_error); } else { binder_inner_proc_lock(proc); binder_set_extended_error(&thread->ee, t_debug_id, return_error, return_error_param); binder_inner_proc_unlock(proc); thread->return_error.cmd = return_error; binder_enqueue_thread_work(thread, &thread->return_error.work); } } static int binder_request_freeze_notification(struct binder_proc *proc, struct binder_thread *thread, struct binder_handle_cookie *handle_cookie) { struct binder_ref_freeze *freeze; struct binder_ref *ref; freeze = kzalloc(sizeof(*freeze), GFP_KERNEL); if (!freeze) return -ENOMEM; binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, handle_cookie->handle, false); if (!ref) { binder_user_error("%d:%d BC_REQUEST_FREEZE_NOTIFICATION invalid ref %d\n", proc->pid, thread->pid, handle_cookie->handle); binder_proc_unlock(proc); kfree(freeze); return -EINVAL; } binder_node_lock(ref->node); if (ref->freeze) { binder_user_error("%d:%d BC_REQUEST_FREEZE_NOTIFICATION already set\n", proc->pid, thread->pid); binder_node_unlock(ref->node); binder_proc_unlock(proc); kfree(freeze); return -EINVAL; } binder_stats_created(BINDER_STAT_FREEZE); INIT_LIST_HEAD(&freeze->work.entry); freeze->cookie = handle_cookie->cookie; freeze->work.type = BINDER_WORK_FROZEN_BINDER; ref->freeze = freeze; if (ref->node->proc) { binder_inner_proc_lock(ref->node->proc); freeze->is_frozen = ref->node->proc->is_frozen; binder_inner_proc_unlock(ref->node->proc); binder_inner_proc_lock(proc); binder_enqueue_work_ilocked(&freeze->work, &proc->todo); binder_wakeup_proc_ilocked(proc); binder_inner_proc_unlock(proc); } binder_node_unlock(ref->node); binder_proc_unlock(proc); return 0; } static int binder_clear_freeze_notification(struct binder_proc *proc, struct binder_thread *thread, struct binder_handle_cookie *handle_cookie) { struct binder_ref_freeze *freeze; struct binder_ref *ref; binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, handle_cookie->handle, false); if (!ref) { binder_user_error("%d:%d BC_CLEAR_FREEZE_NOTIFICATION invalid ref %d\n", proc->pid, thread->pid, handle_cookie->handle); binder_proc_unlock(proc); return -EINVAL; } binder_node_lock(ref->node); if (!ref->freeze) { binder_user_error("%d:%d BC_CLEAR_FREEZE_NOTIFICATION freeze notification not active\n", proc->pid, thread->pid); binder_node_unlock(ref->node); binder_proc_unlock(proc); return -EINVAL; } freeze = ref->freeze; binder_inner_proc_lock(proc); if (freeze->cookie != handle_cookie->cookie) { binder_user_error("%d:%d BC_CLEAR_FREEZE_NOTIFICATION freeze notification cookie mismatch %016llx != %016llx\n", proc->pid, 
thread->pid, (u64)freeze->cookie, (u64)handle_cookie->cookie); binder_inner_proc_unlock(proc); binder_node_unlock(ref->node); binder_proc_unlock(proc); return -EINVAL; } ref->freeze = NULL; /* * Take the existing freeze object and overwrite its work type. There are three cases here: * 1. No pending notification. In this case just add the work to the queue. * 2. A notification was sent and is pending an ack from userspace. Once an ack arrives, we * should resend with the new work type. * 3. A notification is pending to be sent. Since the work is already in the queue, nothing * needs to be done here. */ freeze->work.type = BINDER_WORK_CLEAR_FREEZE_NOTIFICATION; if (list_empty(&freeze->work.entry)) { binder_enqueue_work_ilocked(&freeze->work, &proc->todo); binder_wakeup_proc_ilocked(proc); } else if (freeze->sent) { freeze->resend = true; } binder_inner_proc_unlock(proc); binder_node_unlock(ref->node); binder_proc_unlock(proc); return 0; } static int binder_freeze_notification_done(struct binder_proc *proc, struct binder_thread *thread, binder_uintptr_t cookie) { struct binder_ref_freeze *freeze = NULL; struct binder_work *w; binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->delivered_freeze, entry) { struct binder_ref_freeze *tmp_freeze = container_of(w, struct binder_ref_freeze, work); if (tmp_freeze->cookie == cookie) { freeze = tmp_freeze; break; } } if (!freeze) { binder_user_error("%d:%d BC_FREEZE_NOTIFICATION_DONE %016llx not found\n", proc->pid, thread->pid, (u64)cookie); binder_inner_proc_unlock(proc); return -EINVAL; } binder_dequeue_work_ilocked(&freeze->work); freeze->sent = false; if (freeze->resend) { freeze->resend = false; binder_enqueue_work_ilocked(&freeze->work, &proc->todo); binder_wakeup_proc_ilocked(proc); } binder_inner_proc_unlock(proc); return 0; } /** * binder_free_buf() - free the specified buffer * @proc: binder proc that owns buffer * @buffer: buffer to be freed * @is_failure: failed to send transaction * * If buffer for an async transaction, enqueue the next async * transaction from the node. * * Cleanup buffer and free it. 
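 *
 * Typically reached from the BC_FREE_BUFFER handler in
 * binder_thread_write() once userspace has consumed a delivered
 * transaction, and with @is_failure set when a failed transaction's
 * buffer has to be reclaimed.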
*/ static void binder_free_buf(struct binder_proc *proc, struct binder_thread *thread, struct binder_buffer *buffer, bool is_failure) { binder_inner_proc_lock(proc); if (buffer->transaction) { buffer->transaction->buffer = NULL; buffer->transaction = NULL; } binder_inner_proc_unlock(proc); if (buffer->async_transaction && buffer->target_node) { struct binder_node *buf_node; struct binder_work *w; buf_node = buffer->target_node; binder_node_inner_lock(buf_node); BUG_ON(!buf_node->has_async_transaction); BUG_ON(buf_node->proc != proc); w = binder_dequeue_work_head_ilocked( &buf_node->async_todo); if (!w) { buf_node->has_async_transaction = false; } else { binder_enqueue_work_ilocked( w, &proc->todo); binder_wakeup_proc_ilocked(proc); } binder_node_inner_unlock(buf_node); } trace_binder_transaction_buffer_release(buffer); binder_release_entire_buffer(proc, thread, buffer, is_failure); binder_alloc_free_buf(&proc->alloc, buffer); } static int binder_thread_write(struct binder_proc *proc, struct binder_thread *thread, binder_uintptr_t binder_buffer, size_t size, binder_size_t *consumed) { uint32_t cmd; struct binder_context *context = proc->context; void __user *buffer = (void __user *)(uintptr_t)binder_buffer; void __user *ptr = buffer + *consumed; void __user *end = buffer + size; while (ptr < end && thread->return_error.cmd == BR_OK) { int ret; if (get_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); trace_binder_command(cmd); if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.bc)) { atomic_inc(&binder_stats.bc[_IOC_NR(cmd)]); atomic_inc(&proc->stats.bc[_IOC_NR(cmd)]); atomic_inc(&thread->stats.bc[_IOC_NR(cmd)]); } switch (cmd) { case BC_INCREFS: case BC_ACQUIRE: case BC_RELEASE: case BC_DECREFS: { uint32_t target; const char *debug_string; bool strong = cmd == BC_ACQUIRE || cmd == BC_RELEASE; bool increment = cmd == BC_INCREFS || cmd == BC_ACQUIRE; struct binder_ref_data rdata; if (get_user(target, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); ret = -1; if (increment && !target) { struct binder_node *ctx_mgr_node; mutex_lock(&context->context_mgr_node_lock); ctx_mgr_node = context->binder_context_mgr_node; if (ctx_mgr_node) { if (ctx_mgr_node->proc == proc) { binder_user_error("%d:%d context manager tried to acquire desc 0\n", proc->pid, thread->pid); mutex_unlock(&context->context_mgr_node_lock); return -EINVAL; } ret = binder_inc_ref_for_node( proc, ctx_mgr_node, strong, NULL, &rdata); } mutex_unlock(&context->context_mgr_node_lock); } if (ret) ret = binder_update_ref_for_handle( proc, target, increment, strong, &rdata); if (!ret && rdata.desc != target) { binder_user_error("%d:%d tried to acquire reference to desc %d, got %d instead\n", proc->pid, thread->pid, target, rdata.desc); } switch (cmd) { case BC_INCREFS: debug_string = "IncRefs"; break; case BC_ACQUIRE: debug_string = "Acquire"; break; case BC_RELEASE: debug_string = "Release"; break; case BC_DECREFS: default: debug_string = "DecRefs"; break; } if (ret) { binder_user_error("%d:%d %s %d refcount change on invalid ref %d ret %d\n", proc->pid, thread->pid, debug_string, strong, target, ret); break; } binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s ref %d desc %d s %d w %d\n", proc->pid, thread->pid, debug_string, rdata.debug_id, rdata.desc, rdata.strong, rdata.weak); break; } case BC_INCREFS_DONE: case BC_ACQUIRE_DONE: { binder_uintptr_t node_ptr; binder_uintptr_t cookie; struct binder_node *node; bool free_node; if (get_user(node_ptr, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += 
sizeof(binder_uintptr_t); if (get_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); node = binder_get_node(proc, node_ptr); if (node == NULL) { binder_user_error("%d:%d %s u%016llx no match\n", proc->pid, thread->pid, cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", (u64)node_ptr); break; } if (cookie != node->cookie) { binder_user_error("%d:%d %s u%016llx node %d cookie mismatch %016llx != %016llx\n", proc->pid, thread->pid, cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", (u64)node_ptr, node->debug_id, (u64)cookie, (u64)node->cookie); binder_put_node(node); break; } binder_node_inner_lock(node); if (cmd == BC_ACQUIRE_DONE) { if (node->pending_strong_ref == 0) { binder_user_error("%d:%d BC_ACQUIRE_DONE node %d has no pending acquire request\n", proc->pid, thread->pid, node->debug_id); binder_node_inner_unlock(node); binder_put_node(node); break; } node->pending_strong_ref = 0; } else { if (node->pending_weak_ref == 0) { binder_user_error("%d:%d BC_INCREFS_DONE node %d has no pending increfs request\n", proc->pid, thread->pid, node->debug_id); binder_node_inner_unlock(node); binder_put_node(node); break; } node->pending_weak_ref = 0; } free_node = binder_dec_node_nilocked(node, cmd == BC_ACQUIRE_DONE, 0); WARN_ON(free_node); binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s node %d ls %d lw %d tr %d\n", proc->pid, thread->pid, cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", node->debug_id, node->local_strong_refs, node->local_weak_refs, node->tmp_refs); binder_node_inner_unlock(node); binder_put_node(node); break; } case BC_ATTEMPT_ACQUIRE: pr_err("BC_ATTEMPT_ACQUIRE not supported\n"); return -EINVAL; case BC_ACQUIRE_RESULT: pr_err("BC_ACQUIRE_RESULT not supported\n"); return -EINVAL; case BC_FREE_BUFFER: { binder_uintptr_t data_ptr; struct binder_buffer *buffer; if (get_user(data_ptr, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); buffer = binder_alloc_prepare_to_free(&proc->alloc, data_ptr); if (IS_ERR_OR_NULL(buffer)) { if (PTR_ERR(buffer) == -EPERM) { binder_user_error( "%d:%d BC_FREE_BUFFER u%016llx matched unreturned or currently freeing buffer\n", proc->pid, thread->pid, (u64)data_ptr); } else { binder_user_error( "%d:%d BC_FREE_BUFFER u%016llx no match\n", proc->pid, thread->pid, (u64)data_ptr); } break; } binder_debug(BINDER_DEBUG_FREE_BUFFER, "%d:%d BC_FREE_BUFFER u%016llx found buffer %d for %s transaction\n", proc->pid, thread->pid, (u64)data_ptr, buffer->debug_id, buffer->transaction ? 
"active" : "finished"); binder_free_buf(proc, thread, buffer, false); break; } case BC_TRANSACTION_SG: case BC_REPLY_SG: { struct binder_transaction_data_sg tr; if (copy_from_user(&tr, ptr, sizeof(tr))) return -EFAULT; ptr += sizeof(tr); binder_transaction(proc, thread, &tr.transaction_data, cmd == BC_REPLY_SG, tr.buffers_size); break; } case BC_TRANSACTION: case BC_REPLY: { struct binder_transaction_data tr; if (copy_from_user(&tr, ptr, sizeof(tr))) return -EFAULT; ptr += sizeof(tr); binder_transaction(proc, thread, &tr, cmd == BC_REPLY, 0); break; } case BC_REGISTER_LOOPER: binder_debug(BINDER_DEBUG_THREADS, "%d:%d BC_REGISTER_LOOPER\n", proc->pid, thread->pid); binder_inner_proc_lock(proc); if (thread->looper & BINDER_LOOPER_STATE_ENTERED) { thread->looper |= BINDER_LOOPER_STATE_INVALID; binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called after BC_ENTER_LOOPER\n", proc->pid, thread->pid); } else if (proc->requested_threads == 0) { thread->looper |= BINDER_LOOPER_STATE_INVALID; binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called without request\n", proc->pid, thread->pid); } else { proc->requested_threads--; proc->requested_threads_started++; } thread->looper |= BINDER_LOOPER_STATE_REGISTERED; binder_inner_proc_unlock(proc); break; case BC_ENTER_LOOPER: binder_debug(BINDER_DEBUG_THREADS, "%d:%d BC_ENTER_LOOPER\n", proc->pid, thread->pid); if (thread->looper & BINDER_LOOPER_STATE_REGISTERED) { thread->looper |= BINDER_LOOPER_STATE_INVALID; binder_user_error("%d:%d ERROR: BC_ENTER_LOOPER called after BC_REGISTER_LOOPER\n", proc->pid, thread->pid); } thread->looper |= BINDER_LOOPER_STATE_ENTERED; break; case BC_EXIT_LOOPER: binder_debug(BINDER_DEBUG_THREADS, "%d:%d BC_EXIT_LOOPER\n", proc->pid, thread->pid); thread->looper |= BINDER_LOOPER_STATE_EXITED; break; case BC_REQUEST_DEATH_NOTIFICATION: case BC_CLEAR_DEATH_NOTIFICATION: { uint32_t target; binder_uintptr_t cookie; struct binder_ref *ref; struct binder_ref_death *death = NULL; if (get_user(target, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); if (get_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); if (cmd == BC_REQUEST_DEATH_NOTIFICATION) { /* * Allocate memory for death notification * before taking lock */ death = kzalloc(sizeof(*death), GFP_KERNEL); if (death == NULL) { WARN_ON(thread->return_error.cmd != BR_OK); thread->return_error.cmd = BR_ERROR; binder_enqueue_thread_work( thread, &thread->return_error.work); binder_debug( BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n", proc->pid, thread->pid); break; } } binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, target, false); if (ref == NULL) { binder_user_error("%d:%d %s invalid ref %d\n", proc->pid, thread->pid, cmd == BC_REQUEST_DEATH_NOTIFICATION ? "BC_REQUEST_DEATH_NOTIFICATION" : "BC_CLEAR_DEATH_NOTIFICATION", target); binder_proc_unlock(proc); kfree(death); break; } binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION, "%d:%d %s %016llx ref %d desc %d s %d w %d for node %d\n", proc->pid, thread->pid, cmd == BC_REQUEST_DEATH_NOTIFICATION ? 
"BC_REQUEST_DEATH_NOTIFICATION" : "BC_CLEAR_DEATH_NOTIFICATION", (u64)cookie, ref->data.debug_id, ref->data.desc, ref->data.strong, ref->data.weak, ref->node->debug_id); binder_node_lock(ref->node); if (cmd == BC_REQUEST_DEATH_NOTIFICATION) { if (ref->death) { binder_user_error("%d:%d BC_REQUEST_DEATH_NOTIFICATION death notification already set\n", proc->pid, thread->pid); binder_node_unlock(ref->node); binder_proc_unlock(proc); kfree(death); break; } binder_stats_created(BINDER_STAT_DEATH); INIT_LIST_HEAD(&death->work.entry); death->cookie = cookie; ref->death = death; if (ref->node->proc == NULL) { ref->death->work.type = BINDER_WORK_DEAD_BINDER; binder_inner_proc_lock(proc); binder_enqueue_work_ilocked( &ref->death->work, &proc->todo); binder_wakeup_proc_ilocked(proc); binder_inner_proc_unlock(proc); } } else { if (ref->death == NULL) { binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification not active\n", proc->pid, thread->pid); binder_node_unlock(ref->node); binder_proc_unlock(proc); break; } death = ref->death; if (death->cookie != cookie) { binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification cookie mismatch %016llx != %016llx\n", proc->pid, thread->pid, (u64)death->cookie, (u64)cookie); binder_node_unlock(ref->node); binder_proc_unlock(proc); break; } ref->death = NULL; binder_inner_proc_lock(proc); if (list_empty(&death->work.entry)) { death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION; if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) binder_enqueue_thread_work_ilocked( thread, &death->work); else { binder_enqueue_work_ilocked( &death->work, &proc->todo); binder_wakeup_proc_ilocked( proc); } } else { BUG_ON(death->work.type != BINDER_WORK_DEAD_BINDER); death->work.type = BINDER_WORK_DEAD_BINDER_AND_CLEAR; } binder_inner_proc_unlock(proc); } binder_node_unlock(ref->node); binder_proc_unlock(proc); } break; case BC_DEAD_BINDER_DONE: { struct binder_work *w; binder_uintptr_t cookie; struct binder_ref_death *death = NULL; if (get_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(cookie); binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->delivered_death, entry) { struct binder_ref_death *tmp_death = container_of(w, struct binder_ref_death, work); if (tmp_death->cookie == cookie) { death = tmp_death; break; } } binder_debug(BINDER_DEBUG_DEAD_BINDER, "%d:%d BC_DEAD_BINDER_DONE %016llx found %pK\n", proc->pid, thread->pid, (u64)cookie, death); if (death == NULL) { binder_user_error("%d:%d BC_DEAD_BINDER_DONE %016llx not found\n", proc->pid, thread->pid, (u64)cookie); binder_inner_proc_unlock(proc); break; } binder_dequeue_work_ilocked(&death->work); if (death->work.type == BINDER_WORK_DEAD_BINDER_AND_CLEAR) { death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION; if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) binder_enqueue_thread_work_ilocked( thread, &death->work); else { binder_enqueue_work_ilocked( &death->work, &proc->todo); binder_wakeup_proc_ilocked(proc); } } binder_inner_proc_unlock(proc); } break; case BC_REQUEST_FREEZE_NOTIFICATION: { struct binder_handle_cookie handle_cookie; int error; if (copy_from_user(&handle_cookie, ptr, sizeof(handle_cookie))) return -EFAULT; ptr += sizeof(handle_cookie); error = binder_request_freeze_notification(proc, thread, &handle_cookie); if (error) return error; } break; case BC_CLEAR_FREEZE_NOTIFICATION: { struct binder_handle_cookie handle_cookie; int error; if (copy_from_user(&handle_cookie, 
ptr, sizeof(handle_cookie))) return -EFAULT; ptr += sizeof(handle_cookie); error = binder_clear_freeze_notification(proc, thread, &handle_cookie); if (error) return error; } break; case BC_FREEZE_NOTIFICATION_DONE: { binder_uintptr_t cookie; int error; if (get_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(cookie); error = binder_freeze_notification_done(proc, thread, cookie); if (error) return error; } break; default: pr_err("%d:%d unknown command %u\n", proc->pid, thread->pid, cmd); return -EINVAL; } *consumed = ptr - buffer; } return 0; } static void binder_stat_br(struct binder_proc *proc, struct binder_thread *thread, uint32_t cmd) { trace_binder_return(cmd); if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.br)) { atomic_inc(&binder_stats.br[_IOC_NR(cmd)]); atomic_inc(&proc->stats.br[_IOC_NR(cmd)]); atomic_inc(&thread->stats.br[_IOC_NR(cmd)]); } } static int binder_put_node_cmd(struct binder_proc *proc, struct binder_thread *thread, void __user **ptrp, binder_uintptr_t node_ptr, binder_uintptr_t node_cookie, int node_debug_id, uint32_t cmd, const char *cmd_name) { void __user *ptr = *ptrp; if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); if (put_user(node_ptr, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); if (put_user(node_cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); binder_stat_br(proc, thread, cmd); binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s %d u%016llx c%016llx\n", proc->pid, thread->pid, cmd_name, node_debug_id, (u64)node_ptr, (u64)node_cookie); *ptrp = ptr; return 0; } static int binder_wait_for_work(struct binder_thread *thread, bool do_proc_work) { DEFINE_WAIT(wait); struct binder_proc *proc = thread->proc; int ret = 0; binder_inner_proc_lock(proc); for (;;) { prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE); if (binder_has_work_ilocked(thread, do_proc_work)) break; if (do_proc_work) list_add(&thread->waiting_thread_node, &proc->waiting_threads); binder_inner_proc_unlock(proc); schedule(); binder_inner_proc_lock(proc); list_del_init(&thread->waiting_thread_node); if (signal_pending(current)) { ret = -EINTR; break; } } finish_wait(&thread->wait, &wait); binder_inner_proc_unlock(proc); return ret; } /** * binder_apply_fd_fixups() - finish fd translation * @proc: binder_proc associated @t->buffer * @t: binder transaction with list of fd fixups * * Now that we are in the context of the transaction target * process, we can allocate and install fds. Process the * list of fds to translate and fixup the buffer with the * new fds first and only then install the files. * * If we fail to allocate an fd, skip the install and release * any fds that have already been allocated. 
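 *
 * Return: 0 on success; -ENOMEM if an unused fd could not be reserved
 * for a fixup; -EINVAL if the translated fd could not be copied back
 * into the transaction buffer. (Summary added from the error paths in
 * the function body below.)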
*/ static int binder_apply_fd_fixups(struct binder_proc *proc, struct binder_transaction *t) { struct binder_txn_fd_fixup *fixup, *tmp; int ret = 0; list_for_each_entry(fixup, &t->fd_fixups, fixup_entry) { int fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) { binder_debug(BINDER_DEBUG_TRANSACTION, "failed fd fixup txn %d fd %d\n", t->debug_id, fd); ret = -ENOMEM; goto err; } binder_debug(BINDER_DEBUG_TRANSACTION, "fd fixup txn %d fd %d\n", t->debug_id, fd); trace_binder_transaction_fd_recv(t, fd, fixup->offset); fixup->target_fd = fd; if (binder_alloc_copy_to_buffer(&proc->alloc, t->buffer, fixup->offset, &fd, sizeof(u32))) { ret = -EINVAL; goto err; } } list_for_each_entry_safe(fixup, tmp, &t->fd_fixups, fixup_entry) { fd_install(fixup->target_fd, fixup->file); list_del(&fixup->fixup_entry); kfree(fixup); } return ret; err: binder_free_txn_fixups(t); return ret; } static int binder_thread_read(struct binder_proc *proc, struct binder_thread *thread, binder_uintptr_t binder_buffer, size_t size, binder_size_t *consumed, int non_block) { void __user *buffer = (void __user *)(uintptr_t)binder_buffer; void __user *ptr = buffer + *consumed; void __user *end = buffer + size; int ret = 0; int wait_for_proc_work; if (*consumed == 0) { if (put_user(BR_NOOP, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); } retry: binder_inner_proc_lock(proc); wait_for_proc_work = binder_available_for_proc_work_ilocked(thread); binder_inner_proc_unlock(proc); thread->looper |= BINDER_LOOPER_STATE_WAITING; trace_binder_wait_for_work(wait_for_proc_work, !!thread->transaction_stack, !binder_worklist_empty(proc, &thread->todo)); if (wait_for_proc_work) { if (!(thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED))) { binder_user_error("%d:%d ERROR: Thread waiting for process work before calling BC_REGISTER_LOOPER or BC_ENTER_LOOPER (state %x)\n", proc->pid, thread->pid, thread->looper); wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); } binder_set_nice(proc->default_priority); } if (non_block) { if (!binder_has_work(thread, wait_for_proc_work)) ret = -EAGAIN; } else { ret = binder_wait_for_work(thread, wait_for_proc_work); } thread->looper &= ~BINDER_LOOPER_STATE_WAITING; if (ret) return ret; while (1) { uint32_t cmd; struct binder_transaction_data_secctx tr; struct binder_transaction_data *trd = &tr.transaction_data; struct binder_work *w = NULL; struct list_head *list = NULL; struct binder_transaction *t = NULL; struct binder_thread *t_from; size_t trsize = sizeof(*trd); binder_inner_proc_lock(proc); if (!binder_worklist_empty_ilocked(&thread->todo)) list = &thread->todo; else if (!binder_worklist_empty_ilocked(&proc->todo) && wait_for_proc_work) list = &proc->todo; else { binder_inner_proc_unlock(proc); /* no data added */ if (ptr - buffer == 4 && !thread->looper_need_return) goto retry; break; } if (end - ptr < sizeof(tr) + 4) { binder_inner_proc_unlock(proc); break; } w = binder_dequeue_work_head_ilocked(list); if (binder_worklist_empty_ilocked(&thread->todo)) thread->process_todo = false; switch (w->type) { case BINDER_WORK_TRANSACTION: { binder_inner_proc_unlock(proc); t = container_of(w, struct binder_transaction, work); } break; case BINDER_WORK_RETURN_ERROR: { struct binder_error *e = container_of( w, struct binder_error, work); WARN_ON(e->cmd == BR_OK); binder_inner_proc_unlock(proc); if (put_user(e->cmd, (uint32_t __user *)ptr)) return -EFAULT; cmd = e->cmd; e->cmd = BR_OK; ptr += sizeof(uint32_t); binder_stat_br(proc, thread, cmd); 
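			/*
			 * Descriptive note: the pending error (queued on this
			 * thread by e.g. binder_thread_write() when a command
			 * fails) has now been delivered to userspace;
			 * resetting e->cmd to BR_OK above frees the embedded
			 * binder_error for reuse.
			 */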
} break; case BINDER_WORK_TRANSACTION_COMPLETE: case BINDER_WORK_TRANSACTION_PENDING: case BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT: { if (proc->oneway_spam_detection_enabled && w->type == BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT) cmd = BR_ONEWAY_SPAM_SUSPECT; else if (w->type == BINDER_WORK_TRANSACTION_PENDING) cmd = BR_TRANSACTION_PENDING_FROZEN; else cmd = BR_TRANSACTION_COMPLETE; binder_inner_proc_unlock(proc); kfree(w); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); binder_stat_br(proc, thread, cmd); binder_debug(BINDER_DEBUG_TRANSACTION_COMPLETE, "%d:%d BR_TRANSACTION_COMPLETE\n", proc->pid, thread->pid); } break; case BINDER_WORK_NODE: { struct binder_node *node = container_of(w, struct binder_node, work); int strong, weak; binder_uintptr_t node_ptr = node->ptr; binder_uintptr_t node_cookie = node->cookie; int node_debug_id = node->debug_id; int has_weak_ref; int has_strong_ref; void __user *orig_ptr = ptr; BUG_ON(proc != node->proc); strong = node->internal_strong_refs || node->local_strong_refs; weak = !hlist_empty(&node->refs) || node->local_weak_refs || node->tmp_refs || strong; has_strong_ref = node->has_strong_ref; has_weak_ref = node->has_weak_ref; if (weak && !has_weak_ref) { node->has_weak_ref = 1; node->pending_weak_ref = 1; node->local_weak_refs++; } if (strong && !has_strong_ref) { node->has_strong_ref = 1; node->pending_strong_ref = 1; node->local_strong_refs++; } if (!strong && has_strong_ref) node->has_strong_ref = 0; if (!weak && has_weak_ref) node->has_weak_ref = 0; if (!weak && !strong) { binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d:%d node %d u%016llx c%016llx deleted\n", proc->pid, thread->pid, node_debug_id, (u64)node_ptr, (u64)node_cookie); rb_erase(&node->rb_node, &proc->nodes); binder_inner_proc_unlock(proc); binder_node_lock(node); /* * Acquire the node lock before freeing the * node to serialize with other threads that * may have been holding the node lock while * decrementing this node (avoids race where * this thread frees while the other thread * is unlocking the node after the final * decrement) */ binder_node_unlock(node); binder_free_node(node); } else binder_inner_proc_unlock(proc); if (weak && !has_weak_ref) ret = binder_put_node_cmd( proc, thread, &ptr, node_ptr, node_cookie, node_debug_id, BR_INCREFS, "BR_INCREFS"); if (!ret && strong && !has_strong_ref) ret = binder_put_node_cmd( proc, thread, &ptr, node_ptr, node_cookie, node_debug_id, BR_ACQUIRE, "BR_ACQUIRE"); if (!ret && !strong && has_strong_ref) ret = binder_put_node_cmd( proc, thread, &ptr, node_ptr, node_cookie, node_debug_id, BR_RELEASE, "BR_RELEASE"); if (!ret && !weak && has_weak_ref) ret = binder_put_node_cmd( proc, thread, &ptr, node_ptr, node_cookie, node_debug_id, BR_DECREFS, "BR_DECREFS"); if (orig_ptr == ptr) binder_debug(BINDER_DEBUG_INTERNAL_REFS, "%d:%d node %d u%016llx c%016llx state unchanged\n", proc->pid, thread->pid, node_debug_id, (u64)node_ptr, (u64)node_cookie); if (ret) return ret; } break; case BINDER_WORK_DEAD_BINDER: case BINDER_WORK_DEAD_BINDER_AND_CLEAR: case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: { struct binder_ref_death *death; uint32_t cmd; binder_uintptr_t cookie; death = container_of(w, struct binder_ref_death, work); if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) cmd = BR_CLEAR_DEATH_NOTIFICATION_DONE; else cmd = BR_DEAD_BINDER; cookie = death->cookie; binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION, "%d:%d %s %016llx\n", proc->pid, thread->pid, cmd == 
BR_DEAD_BINDER ? "BR_DEAD_BINDER" : "BR_CLEAR_DEATH_NOTIFICATION_DONE", (u64)cookie); if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) { binder_inner_proc_unlock(proc); kfree(death); binder_stats_deleted(BINDER_STAT_DEATH); } else { binder_enqueue_work_ilocked( w, &proc->delivered_death); binder_inner_proc_unlock(proc); } if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); if (put_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); binder_stat_br(proc, thread, cmd); if (cmd == BR_DEAD_BINDER) goto done; /* DEAD_BINDER notifications can cause transactions */ } break; case BINDER_WORK_FROZEN_BINDER: { struct binder_ref_freeze *freeze; struct binder_frozen_state_info info; memset(&info, 0, sizeof(info)); freeze = container_of(w, struct binder_ref_freeze, work); info.is_frozen = freeze->is_frozen; info.cookie = freeze->cookie; freeze->sent = true; binder_enqueue_work_ilocked(w, &proc->delivered_freeze); binder_inner_proc_unlock(proc); if (put_user(BR_FROZEN_BINDER, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); if (copy_to_user(ptr, &info, sizeof(info))) return -EFAULT; ptr += sizeof(info); binder_stat_br(proc, thread, BR_FROZEN_BINDER); goto done; /* BR_FROZEN_BINDER notifications can cause transactions */ } break; case BINDER_WORK_CLEAR_FREEZE_NOTIFICATION: { struct binder_ref_freeze *freeze = container_of(w, struct binder_ref_freeze, work); binder_uintptr_t cookie = freeze->cookie; binder_inner_proc_unlock(proc); kfree(freeze); binder_stats_deleted(BINDER_STAT_FREEZE); if (put_user(BR_CLEAR_FREEZE_NOTIFICATION_DONE, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); if (put_user(cookie, (binder_uintptr_t __user *)ptr)) return -EFAULT; ptr += sizeof(binder_uintptr_t); binder_stat_br(proc, thread, BR_CLEAR_FREEZE_NOTIFICATION_DONE); } break; default: binder_inner_proc_unlock(proc); pr_err("%d:%d: bad work type %d\n", proc->pid, thread->pid, w->type); break; } if (!t) continue; BUG_ON(t->buffer == NULL); if (t->buffer->target_node) { struct binder_node *target_node = t->buffer->target_node; trd->target.ptr = target_node->ptr; trd->cookie = target_node->cookie; t->saved_priority = task_nice(current); if (t->priority < target_node->min_priority && !(t->flags & TF_ONE_WAY)) binder_set_nice(t->priority); else if (!(t->flags & TF_ONE_WAY) || t->saved_priority > target_node->min_priority) binder_set_nice(target_node->min_priority); cmd = BR_TRANSACTION; } else { trd->target.ptr = 0; trd->cookie = 0; cmd = BR_REPLY; } trd->code = t->code; trd->flags = t->flags; trd->sender_euid = from_kuid(current_user_ns(), t->sender_euid); t_from = binder_get_txn_from(t); if (t_from) { struct task_struct *sender = t_from->proc->tsk; trd->sender_pid = task_tgid_nr_ns(sender, task_active_pid_ns(current)); } else { trd->sender_pid = 0; } ret = binder_apply_fd_fixups(proc, t); if (ret) { struct binder_buffer *buffer = t->buffer; bool oneway = !!(t->flags & TF_ONE_WAY); int tid = t->debug_id; if (t_from) binder_thread_dec_tmpref(t_from); buffer->transaction = NULL; binder_cleanup_transaction(t, "fd fixups failed", BR_FAILED_REPLY); binder_free_buf(proc, thread, buffer, true); binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d %stransaction %d fd fixups failed %d/%d, line %d\n", proc->pid, thread->pid, oneway ? "async " : (cmd == BR_REPLY ? 
"reply " : ""), tid, BR_FAILED_REPLY, ret, __LINE__); if (cmd == BR_REPLY) { cmd = BR_FAILED_REPLY; if (put_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); binder_stat_br(proc, thread, cmd); break; } continue; } trd->data_size = t->buffer->data_size; trd->offsets_size = t->buffer->offsets_size; trd->data.ptr.buffer = t->buffer->user_data; trd->data.ptr.offsets = trd->data.ptr.buffer + ALIGN(t->buffer->data_size, sizeof(void *)); tr.secctx = t->security_ctx; if (t->security_ctx) { cmd = BR_TRANSACTION_SEC_CTX; trsize = sizeof(tr); } if (put_user(cmd, (uint32_t __user *)ptr)) { if (t_from) binder_thread_dec_tmpref(t_from); binder_cleanup_transaction(t, "put_user failed", BR_FAILED_REPLY); return -EFAULT; } ptr += sizeof(uint32_t); if (copy_to_user(ptr, &tr, trsize)) { if (t_from) binder_thread_dec_tmpref(t_from); binder_cleanup_transaction(t, "copy_to_user failed", BR_FAILED_REPLY); return -EFAULT; } ptr += trsize; trace_binder_transaction_received(t); binder_stat_br(proc, thread, cmd); binder_debug(BINDER_DEBUG_TRANSACTION, "%d:%d %s %d %d:%d, cmd %u size %zd-%zd ptr %016llx-%016llx\n", proc->pid, thread->pid, (cmd == BR_TRANSACTION) ? "BR_TRANSACTION" : (cmd == BR_TRANSACTION_SEC_CTX) ? "BR_TRANSACTION_SEC_CTX" : "BR_REPLY", t->debug_id, t_from ? t_from->proc->pid : 0, t_from ? t_from->pid : 0, cmd, t->buffer->data_size, t->buffer->offsets_size, (u64)trd->data.ptr.buffer, (u64)trd->data.ptr.offsets); if (t_from) binder_thread_dec_tmpref(t_from); t->buffer->allow_user_free = 1; if (cmd != BR_REPLY && !(t->flags & TF_ONE_WAY)) { binder_inner_proc_lock(thread->proc); t->to_parent = thread->transaction_stack; t->to_thread = thread; thread->transaction_stack = t; binder_inner_proc_unlock(thread->proc); } else { binder_free_transaction(t); } break; } done: *consumed = ptr - buffer; binder_inner_proc_lock(proc); if (proc->requested_threads == 0 && list_empty(&thread->proc->waiting_threads) && proc->requested_threads_started < proc->max_threads && (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) /* the user-space code fails to */ /*spawn a new thread if we leave this out */) { proc->requested_threads++; binder_inner_proc_unlock(proc); binder_debug(BINDER_DEBUG_THREADS, "%d:%d BR_SPAWN_LOOPER\n", proc->pid, thread->pid); if (put_user(BR_SPAWN_LOOPER, (uint32_t __user *)buffer)) return -EFAULT; binder_stat_br(proc, thread, BR_SPAWN_LOOPER); } else binder_inner_proc_unlock(proc); return 0; } static void binder_release_work(struct binder_proc *proc, struct list_head *list) { struct binder_work *w; enum binder_work_type wtype; while (1) { binder_inner_proc_lock(proc); w = binder_dequeue_work_head_ilocked(list); wtype = w ? 
			w->type : 0;
		binder_inner_proc_unlock(proc);
		if (!w)
			return;

		switch (wtype) {
		case BINDER_WORK_TRANSACTION: {
			struct binder_transaction *t;

			t = container_of(w, struct binder_transaction, work);

			binder_cleanup_transaction(t, "process died.",
						   BR_DEAD_REPLY);
		} break;
		case BINDER_WORK_RETURN_ERROR: {
			struct binder_error *e = container_of(
					w, struct binder_error, work);

			binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
				"undelivered TRANSACTION_ERROR: %u\n",
				e->cmd);
		} break;
		case BINDER_WORK_TRANSACTION_PENDING:
		case BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT:
		case BINDER_WORK_TRANSACTION_COMPLETE: {
			binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
				"undelivered TRANSACTION_COMPLETE\n");
			kfree(w);
			binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
		} break;
		case BINDER_WORK_DEAD_BINDER_AND_CLEAR:
		case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: {
			struct binder_ref_death *death;

			death = container_of(w, struct binder_ref_death, work);
			binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
				"undelivered death notification, %016llx\n",
				(u64)death->cookie);
			kfree(death);
			binder_stats_deleted(BINDER_STAT_DEATH);
		} break;
		case BINDER_WORK_NODE:
			break;
		case BINDER_WORK_CLEAR_FREEZE_NOTIFICATION: {
			struct binder_ref_freeze *freeze;

			freeze = container_of(w, struct binder_ref_freeze, work);
			binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
				     "undelivered freeze notification, %016llx\n",
				     (u64)freeze->cookie);
			kfree(freeze);
			binder_stats_deleted(BINDER_STAT_FREEZE);
		} break;
		default:
			pr_err("unexpected work type, %d, not freed\n",
			       wtype);
			break;
		}
	}
}

static struct binder_thread *binder_get_thread_ilocked(
		struct binder_proc *proc, struct binder_thread *new_thread)
{
	struct binder_thread *thread = NULL;
	struct rb_node *parent = NULL;
	struct rb_node **p = &proc->threads.rb_node;

	while (*p) {
		parent = *p;
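		/*
		 * Descriptive note: standard rbtree "search with insertion
		 * point" walk over proc->threads. "parent" remembers the last
		 * node visited so that, if no existing binder_thread is
		 * found, the caller-supplied new_thread can be linked in at
		 * that slot by the code that follows.
		 */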